# Running on cnode7-012 # Started at Fri Dec 6 23:46:40 CST 2024 # SLURMD_NODENAME=cnode7-012 # SLURM_CLUSTER_NAME=slurm # SLURM_CONF=/cm/shared/apps/slurm/var/etc/slurm/slurm.conf # SLURM_CPUS_ON_NODE=224 # SLURM_CPUS_PER_TASK=128 # SLURM_EXPORT_ENV=PATH # SLURM_GET_USER_ENV=1 # SLURM_GPUS_ON_NODE=8 # SLURM_GPUS_PER_NODE=8 # SLURM_GTIDS=0 # SLURM_JOBID=6508 # SLURM_JOB_CPUS_PER_NODE='224(x2)' # SLURM_JOB_END_TIME=1765035994 # SLURM_JOB_GID=1026 # SLURM_JOB_GPUS=0,1,2,3,4,5,6,7 # SLURM_JOB_ID=6508 # SLURM_JOB_NAME=exp_owsm/s2t_train_05b_ds_raw_bpe50000/train.log # SLURM_JOB_NODELIST='cnode7-[012-013]' # SLURM_JOB_NUM_NODES=2 # SLURM_JOB_PARTITION=p2 # SLURM_JOB_QOS=normal # SLURM_JOB_START_TIME=1733499994 # SLURM_JOB_UID=1026 # SLURM_JOB_USER=williamchen # SLURM_LOCALID=0 # SLURM_MEM_PER_NODE=2048000 # SLURM_NNODES=2 # SLURM_NODEID=0 # SLURM_NODELIST='cnode7-[012-013]' # SLURM_NODE_ALIASES='(null)' # SLURM_OPEN_MODE=a # SLURM_PRIO_PROCESS=0 # SLURM_PROCID=0 # SLURM_SUBMIT_DIR=/mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1 # SLURM_SUBMIT_HOST=154-T2-P1-NVR # SLURM_TASKS_PER_NODE='1(x2)' # SLURM_TASK_PID=3668364 # SLURM_TOPOLOGY_ADDR=cnode7-012 # SLURM_TOPOLOGY_ADDR_PATTERN=node # SLURM_WORKING_CLUSTER=slurm:154-T2-P1-NVR:6817:9984:109 # srun --export=ALL python3 -m espnet2.bin.s2t_train --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp_owsm/s2t_train_05b_ds_raw_bpe50000 --config conf/train_05b_ds.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp,speech,kaldi_ark --train_shape_file exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev,text_prev,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc,text_ctc,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text,text,text --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --ngpu 8 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1/exp_owsm/s2t_train_05b_ds_raw_bpe50000/.dist_init_fa86b3b9-2a9e-444c-b205-4369d60a2c81 [2024-12-06 23:46:54,279] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:46:49,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/bin/python3 /mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp_owsm/s2t_train_05b_ds_raw_bpe50000 --config conf/train_05b_ds.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp,speech,kaldi_ark --train_shape_file exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev,text_prev,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc,text_ctc,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text,text,text --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --ngpu 8 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1/exp_owsm/s2t_train_05b_ds_raw_bpe50000/.dist_init_fa86b3b9-2a9e-444c-b205-4369d60a2c81 /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/bin/python3 /mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp_owsm/s2t_train_05b_ds_raw_bpe50000 --config conf/train_05b_ds.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp,speech,kaldi_ark --train_shape_file exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev,text_prev,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc,text_ctc,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text,text,text --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --ngpu 8 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1/exp_owsm/s2t_train_05b_ds_raw_bpe50000/.dist_init_fa86b3b9-2a9e-444c-b205-4369d60a2c81 [2024-12-06 23:47:10,332] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:47:11,058] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:47:11,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:47:15,942] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:47:15,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:47:11,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:47:16,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:47:11,682] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:47:16,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:47:11,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:47:16,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:47:11,907] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:47:11,924] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:47:16,728] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:47:16,772] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:47:16,824] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [W1206 23:47:18.763075132 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1206 23:47:19.119481777 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1206 23:47:19.146427257 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1206 23:47:23.294104699 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1206 23:47:19.260567270 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1206 23:47:19.461353755 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1206 23:47:24.688476751 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1206 23:47:24.771958657 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1206 23:47:24.813028558 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1206 23:47:19.846326418 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1206 23:47:19.860623658 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1206 23:47:19.921460282 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1206 23:47:24.096829347 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1206 23:47:24.223345117 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1206 23:47:24.224138659 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [cnode7-012:0/16] 2024-12-06 23:47:24,832 (s2t:462) INFO: Vocabulary size: 50002 [W1206 23:47:24.311234988 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [cnode7-012:0/16] 2024-12-06 23:47:28,320 (abs_task:1383) INFO: pytorch.version=2.4.0+cu121, cuda.available=True, cudnn.version=90100, cudnn.benchmark=False, cudnn.deterministic=True [cnode7-012:0/16] 2024-12-06 23:47:28,326 (abs_task:1384) INFO: Model structure: ESPnetS2TModel( (frontend): DefaultFrontend( (stft): Stft(n_fft=512, win_length=400, hop_length=160, center=True, normalized=False, onesided=True) (frontend): Frontend() (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) ) (specaug): SpecAug( (freq_mask): MaskAlongAxis(mask_width_range=[0, 27], num_mask=2, axis=freq) (time_mask): MaskAlongAxisVariableMaxWidth(mask_width_ratio_range=[0.0, 0.05], num_mask=10, axis=time) ) (normalize): GlobalMVN(stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz, norm_means=True, norm_vars=True) (encoder): TransformerEncoder( (embed): Conv2dSubsampling( (conv): Sequential( (0): Conv2d(1, 1024, kernel_size=(3, 3), stride=(2, 2)) (1): ReLU() (2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2)) (3): ReLU() ) (out): Sequential( (0): Linear(in_features=19456, out_features=1024, bias=True) (1): PositionalEncoding( (dropout): Dropout(p=0.1, inplace=False) ) ) ) (encoders): MultiSequential( (0): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (1): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (2): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (3): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (4): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (5): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (6): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (7): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (8): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (9): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (10): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (11): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (12): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (13): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (14): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (15): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) ) (decoder): TransformerDecoder( (embed): Sequential( (0): Embedding(50002, 1024) (1): PositionalEncoding( (dropout): Dropout(p=0.1, inplace=False) ) ) (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (output_layer): Linear(in_features=1024, out_features=50002, bias=True) (decoders): MultiSequential( (0): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (1): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (2): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (3): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (4): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (5): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (6): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (7): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (8): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (9): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (10): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (11): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (12): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (13): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (14): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (15): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) (criterion_att): LabelSmoothingLoss( (criterion): KLDivLoss() ) (ctc): CTC( (ctc_lo): Linear(in_features=1024, out_features=50002, bias=True) (ctc_loss): CTCLoss() ) ) Model summary: Class Name: ESPnetS2TModel Total Number of model parameters: 653.37 M Number of trainable parameters: 653.37 M (100.0%) Size: 2.61 GB Type: torch.float32 [cnode7-012:0/16] 2024-12-06 23:47:28,326 (abs_task:1387) INFO: Optimizer: Adadelta ( Parameter Group 0 capturable: False differentiable: False eps: 1e-06 foreach: None lr: 1.0 maximize: False rho: 0.9 weight_decay: 0 ) [cnode7-012:0/16] 2024-12-06 23:47:28,326 (abs_task:1388) INFO: Scheduler: None [cnode7-012:0/16] 2024-12-06 23:47:28,331 (abs_task:1397) INFO: Saving the configuration in exp_owsm/s2t_train_05b_ds_raw_bpe50000/config.yaml [cnode7-012:0/16] 2024-12-06 23:47:32,911 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-06 23:47:30,344] [INFO] [comm.py:652:init_distributed] cdb=None /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-06 23:47:30,347] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-06 23:47:35,318] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-06 23:47:30,533] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-06 23:47:30,534] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 [2024-12-06 23:47:35,339] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-06 23:47:30,725] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-06 23:47:30,726] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-06 23:47:35,634] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-06 23:47:35,635] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 [2024-12-06 23:47:30,872] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-06 23:47:30,874] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 [2024-12-06 23:47:35,760] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-06 23:47:35,762] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 [2024-12-06 23:47:30,971] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-06 23:47:30,972] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 [2024-12-06 23:47:35,773] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-06 23:47:35,774] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 [2024-12-06 23:47:35,808] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-06 23:47:35,809] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [cnode7-012:0/16] 2024-12-06 23:47:35,900 (abs_task:1807) INFO: [valid] dataset: ESPnetDataset( speech: {"path": "dump/raw/dev_v3/wav.scp", "type": "kaldi_ark"} text_prev: {"path": "dump/raw/dev_v3/text.prev", "type": "text"} text_ctc: {"path": "dump/raw/dev_v3/text.ctc", "type": "text"} text: {"path": "dump/raw/dev_v3/text", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-06 23:47:35,900 (abs_task:1808) INFO: [valid] Batch sampler: SortedBatchSampler(N-batch=74743, batch_size=16, shape_file=exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-06 23:47:35,906 (abs_task:1809) INFO: [valid] mini-batch sizes summary: N-batch=74743, mean=16.0, min=16, max=17 /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-06 23:47:35,913] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-06 23:47:35,914] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [cnode7-012:0/16] 2024-12-06 23:47:35,960 (distributed_utils:129) WARNING: ================================================================= Found OMP_NUM_THREADS=1 in environment variables. With some advanced features, DeepSpeed may have heavy cpu workload so that OMP_NUM_THREADS=1 is not sufficient. Try to increase it in your path.sh ================================================================= [2024-12-06 23:47:35,960] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-06 23:47:35,961] [INFO] [logging.py:129:log_dist] [Rank 0] DeepSpeed info: version=0.15.3, git-hash=unknown, git-branch=unknown [2024-12-06 23:47:35,961] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 cnode7-012:3669398:3669398 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669398:3669398 [0] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:3669398:3669398 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:3669398:3669398 [0] NCCL INFO cudaDriverVersion 12020 NCCL version 2.20.5+cuda12.4 cnode7-012:3669409:3669409 [4] NCCL INFO cudaDriverVersion 12020 cnode7-012:3669409:3669409 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669409:3669409 [4] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:3669409:3669409 [4] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation [2024-12-06 23:47:31,275] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-06 23:47:31,276] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 cnode7-013:3148938:3148938 [0] NCCL INFO cudaDriverVersion 12020 cnode7-013:3148938:3148938 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3148938:3148938 [0] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:3148938:3148938 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:3148943:3148943 [5] NCCL INFO cudaDriverVersion 12020 cnode7-013:3148943:3148943 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3148943:3148943 [5] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:3148943:3148943 [5] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation [2024-12-06 23:47:31,281] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-06 23:47:31,282] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 cnode7-013:3148944:3148944 [6] NCCL INFO cudaDriverVersion 12020 cnode7-013:3148944:3148944 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3148944:3148944 [6] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:3148944:3148944 [6] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:3148939:3148939 [1] NCCL INFO cudaDriverVersion 12020 cnode7-013:3148939:3148939 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3148939:3148939 [1] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:3148939:3148939 [1] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:3669416:3669416 [7] NCCL INFO cudaDriverVersion 12020 cnode7-012:3669416:3669416 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669416:3669416 [7] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:3669416:3669416 [7] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:3669414:3669414 [5] NCCL INFO cudaDriverVersion 12020 cnode7-012:3669414:3669414 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669414:3669414 [5] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:3669414:3669414 [5] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:3669401:3669401 [2] NCCL INFO cudaDriverVersion 12020 cnode7-012:3669401:3669401 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669401:3669401 [2] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:3669401:3669401 [2] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:3669402:3669402 [3] NCCL INFO cudaDriverVersion 12020 cnode7-012:3669402:3669402 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669402:3669402 [3] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:3669402:3669402 [3] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:3148940:3148940 [2] NCCL INFO cudaDriverVersion 12020 cnode7-013:3148940:3148940 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3148940:3148940 [2] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:3148940:3148940 [2] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:3669415:3669415 [6] NCCL INFO cudaDriverVersion 12020 cnode7-012:3669415:3669415 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669415:3669415 [6] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:3669415:3669415 [6] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:3148941:3148941 [3] NCCL INFO cudaDriverVersion 12020 cnode7-013:3148941:3148941 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3148941:3148941 [3] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:3148941:3148941 [3] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:3148942:3148942 [4] NCCL INFO cudaDriverVersion 12020 cnode7-013:3148942:3148942 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3148942:3148942 [4] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:3148942:3148942 [4] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( cnode7-012:3669415:3670092 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669398:3670088 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669416:3670093 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669414:3670091 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669401:3670095 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669402:3670094 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669409:3670090 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3148940:3149821 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669415:3670092 [6] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:3669415:3670092 [6] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669415:3670092 [6] NCCL INFO Using network IB cnode7-013:3148938:3149822 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3148943:3149823 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669398:3670088 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:3669398:3670088 [0] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669398:3670088 [0] NCCL INFO Using network IB cnode7-013:3148941:3149829 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3148944:3149820 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3148939:3149819 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3148939:3149819 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:3148939:3149819 [1] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148939:3149819 [1] NCCL INFO Using network IB cnode7-012:3669416:3670093 [7] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:3669416:3670093 [7] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669416:3670093 [7] NCCL INFO Using network IB cnode7-013:3148942:3149828 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3148942:3149828 [4] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:3148942:3149828 [4] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148942:3149828 [4] NCCL INFO Using network IB cnode7-012:3669414:3670091 [5] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:3669414:3670091 [5] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669414:3670091 [5] NCCL INFO Using network IB [2024-12-06 23:47:31,620] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-06 23:47:31,622] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 cnode7-012:3669401:3670095 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:3669401:3670095 [2] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669401:3670095 [2] NCCL INFO Using network IB cnode7-012:3669402:3670094 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:3669402:3670094 [3] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669402:3670094 [3] NCCL INFO Using network IB cnode7-012:3669409:3670090 [4] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:3669409:3670090 [4] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669409:3670090 [4] NCCL INFO Using network IB cnode7-013:3148940:3149821 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:3148940:3149821 [2] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148940:3149821 [2] NCCL INFO Using network IB cnode7-013:3148938:3149822 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:3148938:3149822 [0] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148938:3149822 [0] NCCL INFO Using network IB cnode7-013:3148943:3149823 [5] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:3148943:3149823 [5] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148943:3149823 [5] NCCL INFO Using network IB cnode7-013:3148941:3149829 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:3148941:3149829 [3] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148941:3149829 [3] NCCL INFO Using network IB cnode7-013:3148944:3149820 [6] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:3148944:3149820 [6] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148944:3149820 [6] NCCL INFO Using network IB cnode7-013:3148945:3148945 [7] NCCL INFO cudaDriverVersion 12020 cnode7-013:3148945:3148945 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3148945:3148945 [7] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:3148945:3148945 [7] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-06 23:47:37,291] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-06 23:47:37,293] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 cnode7-012:3669399:3669399 [1] NCCL INFO cudaDriverVersion 12020 cnode7-012:3669399:3669399 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669399:3669399 [1] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:3669399:3669399 [1] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:3148945:3149895 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:3148945:3149895 [7] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:3148945:3149895 [7] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148945:3149895 [7] NCCL INFO Using network IB cnode7-012:3669399:3670191 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:3669399:3670191 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:3669399:3670191 [1] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669399:3670191 [1] NCCL INFO Using network IB cnode7-012:3669398:3670088 [0] NCCL INFO comm 0x55556fbdc870 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x984a2c3e4aa92e48 - Init START cnode7-013:3148941:3149829 [3] NCCL INFO comm 0x55558061b500 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x984a2c3e4aa92e48 - Init START cnode7-013:3148944:3149820 [6] NCCL INFO comm 0x55556e41dbc0 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x984a2c3e4aa92e48 - Init START cnode7-013:3148939:3149819 [1] NCCL INFO comm 0x55557261e6c0 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x984a2c3e4aa92e48 - Init START cnode7-013:3148945:3149895 [7] NCCL INFO comm 0x55558461c870 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x984a2c3e4aa92e48 - Init START cnode7-012:3669416:3670093 [7] NCCL INFO comm 0x555568c19990 rank 7 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x984a2c3e4aa92e48 - Init START cnode7-013:3148942:3149828 [4] NCCL INFO comm 0x555581619730 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x984a2c3e4aa92e48 - Init START cnode7-012:3669414:3670091 [5] NCCL INFO comm 0x555581106840 rank 5 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x984a2c3e4aa92e48 - Init START cnode7-012:3669401:3670095 [2] NCCL INFO comm 0x55558161a910 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x984a2c3e4aa92e48 - Init START cnode7-012:3669402:3670094 [3] NCCL INFO comm 0x55558161a080 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x984a2c3e4aa92e48 - Init START cnode7-012:3669399:3670191 [1] NCCL INFO comm 0x55556ca1a510 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x984a2c3e4aa92e48 - Init START cnode7-012:3669409:3670090 [4] NCCL INFO comm 0x55556ca1a380 rank 4 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x984a2c3e4aa92e48 - Init START cnode7-013:3148940:3149821 [2] NCCL INFO comm 0x555560a24400 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x984a2c3e4aa92e48 - Init START cnode7-012:3669415:3670092 [6] NCCL INFO comm 0x555562a21340 rank 6 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x984a2c3e4aa92e48 - Init START cnode7-013:3148938:3149822 [0] NCCL INFO comm 0x555564791160 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x984a2c3e4aa92e48 - Init START cnode7-013:3148943:3149823 [5] NCCL INFO comm 0x55557b01b270 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x984a2c3e4aa92e48 - Init START cnode7-012:3669401:3670095 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:3669401:3670095 [2] NCCL INFO NVLS multicast support is available on dev 2 cnode7-012:3669399:3670191 [1] NCCL INFO Setting affinity for GPU 1 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:3669399:3670191 [1] NCCL INFO NVLS multicast support is available on dev 1 cnode7-012:3669409:3670090 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:3669409:3670090 [4] NCCL INFO NVLS multicast support is available on dev 4 cnode7-013:3148940:3149821 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:3148940:3149821 [2] NCCL INFO NVLS multicast support is available on dev 2 cnode7-012:3669415:3670092 [6] NCCL INFO Setting affinity for GPU 6 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:3669415:3670092 [6] NCCL INFO NVLS multicast support is available on dev 6 cnode7-013:3148938:3149822 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:3148938:3149822 [0] NCCL INFO NVLS multicast support is available on dev 0 cnode7-013:3148943:3149823 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:3148943:3149823 [5] NCCL INFO NVLS multicast support is available on dev 5 cnode7-012:3669398:3670088 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:3669398:3670088 [0] NCCL INFO NVLS multicast support is available on dev 0 cnode7-013:3148941:3149829 [3] NCCL INFO Setting affinity for GPU 3 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:3148941:3149829 [3] NCCL INFO NVLS multicast support is available on dev 3 cnode7-013:3148939:3149819 [1] NCCL INFO Setting affinity for GPU 1 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:3148939:3149819 [1] NCCL INFO NVLS multicast support is available on dev 1 cnode7-013:3148944:3149820 [6] NCCL INFO Setting affinity for GPU 6 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:3148944:3149820 [6] NCCL INFO NVLS multicast support is available on dev 6 cnode7-013:3148945:3149895 [7] NCCL INFO Setting affinity for GPU 7 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:3148945:3149895 [7] NCCL INFO NVLS multicast support is available on dev 7 cnode7-012:3669416:3670093 [7] NCCL INFO Setting affinity for GPU 7 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:3669416:3670093 [7] NCCL INFO NVLS multicast support is available on dev 7 cnode7-013:3148942:3149828 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:3148942:3149828 [4] NCCL INFO NVLS multicast support is available on dev 4 cnode7-012:3669414:3670091 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:3669414:3670091 [5] NCCL INFO NVLS multicast support is available on dev 5 cnode7-012:3669414:3670091 [5] NCCL INFO comm 0x555581106840 rank 5 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0 cnode7-012:3669414:3670091 [5] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669414:3670091 [5] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669414:3670091 [5] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669414:3670091 [5] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669414:3670091 [5] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669414:3670091 [5] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669414:3670091 [5] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669414:3670091 [5] NCCL INFO NVLS Head 7: 7 15 cnode7-013:3148945:3149895 [7] NCCL INFO comm 0x55558461c870 rank 15 nRanks 16 nNodes 2 localRanks 8 localRank 7 MNNVL 0 cnode7-013:3148945:3149895 [7] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 8/-1/-1->15->14 [2] 8/-1/-1->15->14 [3] 8/-1/-1->15->14 [4] 8/-1/-1->15->14 [5] 8/-1/-1->15->14 [6] 8/-1/-1->15->14 [7] 8/-1/-1->15->7 [8] -1/-1/-1->15->14 [9] 8/-1/-1->15->14 [10] 8/-1/-1->15->14 [11] 8/-1/-1->15->14 [12] 8/-1/-1->15->14 [13] 8/-1/-1->15->14 [14] 8/-1/-1->15->14 [15] 8/7/-1->15->-1 cnode7-013:3148945:3149895 [7] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3148943:3149823 [5] NCCL INFO comm 0x55557b01b270 rank 13 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0 cnode7-013:3148943:3149823 [5] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->12 [2] 14/-1/-1->13->12 [3] 14/-1/-1->13->12 [4] 14/-1/-1->13->12 [5] 14/-1/-1->13->5 [6] -1/-1/-1->13->12 [7] 14/-1/-1->13->12 [8] 14/-1/-1->13->12 [9] 14/-1/-1->13->12 [10] 14/-1/-1->13->12 [11] 14/-1/-1->13->12 [12] 14/-1/-1->13->12 [13] 14/5/-1->13->-1 [14] -1/-1/-1->13->12 [15] 14/-1/-1->13->12 cnode7-013:3148943:3149823 [5] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3148944:3149820 [6] NCCL INFO comm 0x55556e41dbc0 rank 14 nRanks 16 nNodes 2 localRanks 8 localRank 6 MNNVL 0 cnode7-013:3148944:3149820 [6] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 15/-1/-1->14->13 [3] 15/-1/-1->14->13 [4] 15/-1/-1->14->13 [5] 15/-1/-1->14->13 [6] 15/-1/-1->14->6 [7] -1/-1/-1->14->13 [8] 15/-1/-1->14->13 [9] 15/-1/-1->14->13 [10] 15/-1/-1->14->13 [11] 15/-1/-1->14->13 [12] 15/-1/-1->14->13 [13] 15/-1/-1->14->13 [14] 15/6/-1->14->-1 [15] -1/-1/-1->14->13 cnode7-013:3148944:3149820 [6] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3148940:3149821 [2] NCCL INFO comm 0x555560a24400 rank 10 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0 cnode7-013:3148940:3149821 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 [2] 11/-1/-1->10->2 [3] -1/-1/-1->10->9 [4] 11/-1/-1->10->9 [5] 11/-1/-1->10->9 [6] 11/-1/-1->10->9 [7] 11/-1/-1->10->9 [8] 11/-1/-1->10->9 [9] 11/-1/-1->10->9 [10] 11/2/-1->10->-1 [11] -1/-1/-1->10->9 [12] 11/-1/-1->10->9 [13] 11/-1/-1->10->9 [14] 11/-1/-1->10->9 [15] 11/-1/-1->10->9 cnode7-013:3148940:3149821 [2] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669402:3670094 [3] NCCL INFO Setting affinity for GPU 3 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:3669402:3670094 [3] NCCL INFO NVLS multicast support is available on dev 3 cnode7-012:3669402:3670094 [3] NCCL INFO comm 0x55558161a080 rank 3 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0 cnode7-012:3669402:3670094 [3] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669402:3670094 [3] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669402:3670094 [3] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669402:3670094 [3] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669402:3670094 [3] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669402:3670094 [3] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669402:3670094 [3] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669402:3670094 [3] NCCL INFO NVLS Head 7: 7 15 cnode7-012:3669402:3670094 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/11/-1->3->-1 [4] -1/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->11 [12] -1/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 cnode7-012:3669402:3670094 [3] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669409:3670090 [4] NCCL INFO comm 0x55556ca1a380 rank 4 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0 cnode7-012:3669409:3670090 [4] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669409:3670090 [4] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669409:3670090 [4] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669409:3670090 [4] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669409:3670090 [4] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669409:3670090 [4] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669409:3670090 [4] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669409:3670090 [4] NCCL INFO NVLS Head 7: 7 15 cnode7-012:3669409:3670090 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/12/-1->4->-1 [5] -1/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->12 [13] -1/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 cnode7-012:3669409:3670090 [4] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669401:3670095 [2] NCCL INFO comm 0x55558161a910 rank 2 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0 cnode7-012:3669401:3670095 [2] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669401:3670095 [2] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669401:3670095 [2] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669401:3670095 [2] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669401:3670095 [2] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669401:3670095 [2] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669401:3670095 [2] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669401:3670095 [2] NCCL INFO NVLS Head 7: 7 15 cnode7-012:3669401:3670095 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/10/-1->2->-1 [3] -1/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->10 [11] -1/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 cnode7-012:3669401:3670095 [2] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3148942:3149828 [4] NCCL INFO comm 0x555581619730 rank 12 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0 cnode7-013:3148942:3149828 [4] NCCL INFO Trees [0] 13/-1/-1->12->11 [1] 13/-1/-1->12->11 [2] 13/-1/-1->12->11 [3] 13/-1/-1->12->11 [4] 13/-1/-1->12->4 [5] -1/-1/-1->12->11 [6] 13/-1/-1->12->11 [7] 13/-1/-1->12->11 [8] 13/-1/-1->12->11 [9] 13/-1/-1->12->11 [10] 13/-1/-1->12->11 [11] 13/-1/-1->12->11 [12] 13/4/-1->12->-1 [13] -1/-1/-1->12->11 [14] 13/-1/-1->12->11 [15] 13/-1/-1->12->11 cnode7-013:3148942:3149828 [4] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3148939:3149819 [1] NCCL INFO comm 0x55557261e6c0 rank 9 nRanks 16 nNodes 2 localRanks 8 localRank 1 MNNVL 0 cnode7-013:3148939:3149819 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] 10/-1/-1->9->1 [2] -1/-1/-1->9->8 [3] 10/-1/-1->9->8 [4] 10/-1/-1->9->8 [5] 10/-1/-1->9->8 [6] 10/-1/-1->9->8 [7] 10/-1/-1->9->8 [8] 10/-1/-1->9->8 [9] 10/1/-1->9->-1 [10] -1/-1/-1->9->8 [11] 10/-1/-1->9->8 [12] 10/-1/-1->9->8 [13] 10/-1/-1->9->8 [14] 10/-1/-1->9->8 [15] 10/-1/-1->9->8 cnode7-013:3148939:3149819 [1] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3148938:3149822 [0] NCCL INFO comm 0x555564791160 rank 8 nRanks 16 nNodes 2 localRanks 8 localRank 0 MNNVL 0 cnode7-013:3148938:3149822 [0] NCCL INFO Trees [0] 9/-1/-1->8->0 [1] -1/-1/-1->8->15 [2] 9/-1/-1->8->15 [3] 9/-1/-1->8->15 [4] 9/-1/-1->8->15 [5] 9/-1/-1->8->15 [6] 9/-1/-1->8->15 [7] 9/-1/-1->8->15 [8] 9/0/-1->8->-1 [9] -1/-1/-1->8->15 [10] 9/-1/-1->8->15 [11] 9/-1/-1->8->15 [12] 9/-1/-1->8->15 [13] 9/-1/-1->8->15 [14] 9/-1/-1->8->15 [15] 9/-1/-1->8->15 cnode7-013:3148938:3149822 [0] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3148941:3149829 [3] NCCL INFO comm 0x55558061b500 rank 11 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0 cnode7-013:3148941:3149829 [3] NCCL INFO Trees [0] 12/-1/-1->11->10 [1] 12/-1/-1->11->10 [2] 12/-1/-1->11->10 [3] 12/-1/-1->11->3 [4] -1/-1/-1->11->10 [5] 12/-1/-1->11->10 [6] 12/-1/-1->11->10 [7] 12/-1/-1->11->10 [8] 12/-1/-1->11->10 [9] 12/-1/-1->11->10 [10] 12/-1/-1->11->10 [11] 12/3/-1->11->-1 [12] -1/-1/-1->11->10 [13] 12/-1/-1->11->10 [14] 12/-1/-1->11->10 [15] 12/-1/-1->11->10 cnode7-013:3148941:3149829 [3] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669415:3670092 [6] NCCL INFO comm 0x555562a21340 rank 6 nRanks 16 nNodes 2 localRanks 8 localRank 6 MNNVL 0 cnode7-012:3669415:3670092 [6] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669415:3670092 [6] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669415:3670092 [6] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669415:3670092 [6] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669415:3670092 [6] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669415:3670092 [6] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669415:3670092 [6] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669415:3670092 [6] NCCL INFO NVLS Head 7: 7 15 cnode7-012:3669415:3670092 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/14/-1->6->-1 [7] -1/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->14 [15] -1/-1/-1->6->5 cnode7-012:3669415:3670092 [6] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669398:3670088 [0] NCCL INFO comm 0x55556fbdc870 rank 0 nRanks 16 nNodes 2 localRanks 8 localRank 0 MNNVL 0 cnode7-012:3669398:3670088 [0] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669398:3670088 [0] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669398:3670088 [0] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669398:3670088 [0] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669398:3670088 [0] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669398:3670088 [0] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669398:3670088 [0] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669398:3670088 [0] NCCL INFO NVLS Head 7: 7 15 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 00/16 : 0 7 6 5 4 3 2 1 9 10 11 12 13 14 15 8 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 01/16 : 0 8 15 14 13 12 11 10 9 1 2 3 4 5 6 7 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 02/16 : 0 7 6 5 4 3 11 12 13 14 15 8 9 10 2 1 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 03/16 : 0 1 2 10 9 8 15 14 13 12 11 3 4 5 6 7 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 04/16 : 0 7 6 5 13 14 15 8 9 10 11 12 4 3 2 1 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 05/16 : 0 1 2 3 4 12 11 10 9 8 15 14 13 5 6 7 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 06/16 : 0 7 15 8 9 10 11 12 13 14 6 5 4 3 2 1 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 07/16 : 0 1 2 3 4 5 6 14 13 12 11 10 9 8 15 7 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 08/16 : 0 7 6 5 4 3 2 1 9 10 11 12 13 14 15 8 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 09/16 : 0 8 15 14 13 12 11 10 9 1 2 3 4 5 6 7 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 10/16 : 0 7 6 5 4 3 11 12 13 14 15 8 9 10 2 1 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 11/16 : 0 1 2 10 9 8 15 14 13 12 11 3 4 5 6 7 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 12/16 : 0 7 6 5 13 14 15 8 9 10 11 12 4 3 2 1 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 13/16 : 0 1 2 3 4 12 11 10 9 8 15 14 13 5 6 7 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 14/16 : 0 7 15 8 9 10 11 12 13 14 6 5 4 3 2 1 cnode7-012:3669398:3670088 [0] NCCL INFO Channel 15/16 : 0 1 2 3 4 5 6 14 13 12 11 10 9 8 15 7 cnode7-012:3669398:3670088 [0] NCCL INFO Trees [0] 1/8/-1->0->-1 [1] -1/-1/-1->0->7 [2] 1/-1/-1->0->7 [3] 1/-1/-1->0->7 [4] 1/-1/-1->0->7 [5] 1/-1/-1->0->7 [6] 1/-1/-1->0->7 [7] 1/-1/-1->0->7 [8] 1/-1/-1->0->8 [9] -1/-1/-1->0->7 [10] 1/-1/-1->0->7 [11] 1/-1/-1->0->7 [12] 1/-1/-1->0->7 [13] 1/-1/-1->0->7 [14] 1/-1/-1->0->7 [15] 1/-1/-1->0->7 cnode7-012:3669398:3670088 [0] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669416:3670093 [7] NCCL INFO comm 0x555568c19990 rank 7 nRanks 16 nNodes 2 localRanks 8 localRank 7 MNNVL 0 cnode7-012:3669416:3670093 [7] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669416:3670093 [7] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669416:3670093 [7] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669416:3670093 [7] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669416:3670093 [7] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669416:3670093 [7] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669416:3670093 [7] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669416:3670093 [7] NCCL INFO NVLS Head 7: 7 15 cnode7-012:3669416:3670093 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] 0/-1/-1->7->6 [2] 0/-1/-1->7->6 [3] 0/-1/-1->7->6 [4] 0/-1/-1->7->6 [5] 0/-1/-1->7->6 [6] 0/-1/-1->7->6 [7] 0/15/-1->7->-1 [8] -1/-1/-1->7->6 [9] 0/-1/-1->7->6 [10] 0/-1/-1->7->6 [11] 0/-1/-1->7->6 [12] 0/-1/-1->7->6 [13] 0/-1/-1->7->6 [14] 0/-1/-1->7->6 [15] 0/-1/-1->7->15 cnode7-012:3669416:3670093 [7] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669414:3670091 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/13/-1->5->-1 [6] -1/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->13 [14] -1/-1/-1->5->4 [15] 6/-1/-1->5->4 cnode7-012:3669414:3670091 [5] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669399:3670191 [1] NCCL INFO comm 0x55556ca1a510 rank 1 nRanks 16 nNodes 2 localRanks 8 localRank 1 MNNVL 0 cnode7-012:3669399:3670191 [1] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669399:3670191 [1] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669399:3670191 [1] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669399:3670191 [1] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669399:3670191 [1] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669399:3670191 [1] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669399:3670191 [1] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669399:3670191 [1] NCCL INFO NVLS Head 7: 7 15 cnode7-012:3669399:3670191 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/9/-1->1->-1 [2] -1/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->9 [10] -1/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 cnode7-012:3669399:3670191 [1] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669415:3670092 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 11/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 00/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 02/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 04/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 06/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 08/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 10/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 12/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 00/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 02/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 04/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 06/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 00/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 02/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 04/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 06/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 08/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 10/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 12/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 14/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 00/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 02/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 04/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 08/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 10/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 12/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 15/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 06/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 14/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 00/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 02/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 06/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 08/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 10/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 14/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 00/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 02/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 06/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 08/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 10/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 09/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 00/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 08/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 00/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 04/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 06/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 14/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 15/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 00/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 02/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 04/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 06/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 08/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 10/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 12/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 14/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 02/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 04/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 06/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 10/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 12/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 14/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 01/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 14/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 00/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 02/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 04/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 06/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 08/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 10/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 12/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 14/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 08/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 01/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 09/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 07/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 15/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 06/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 14/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 14/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 12/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 00/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 02/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 04/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 06/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 08/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 10/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 12/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 14/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 06/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 08/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 12/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 14/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 01/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 03/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 03/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 05/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 07/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 09/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 11/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 13/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 15/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 01/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 09/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 08/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 01/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 03/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 07/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 09/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 11/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 15/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 08/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 10/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 12/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 14/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 01/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 05/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 07/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 09/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 13/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 15/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 00/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 08/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 09/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 03/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 05/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 07/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 11/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 13/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 15/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 01/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 03/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 05/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 07/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 09/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 11/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 13/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 15/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 06/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 14/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 07/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 15/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 00/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 02/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 04/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 06/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 08/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 10/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 12/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 14/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 01/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 03/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 05/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 09/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 11/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 13/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 09/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 11/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 15/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 00/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 02/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 04/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 08/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 10/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 12/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 01/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 03/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 05/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 07/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 09/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 11/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 13/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 15/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 14/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 05/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 07/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 09/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 11/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 13/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 15/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Connected all rings cnode7-012:3669398:3670088 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 01/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 03/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 05/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 07/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 09/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 11/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 13/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 15/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Connected all rings cnode7-013:3148941:3149829 [3] NCCL INFO Channel 01/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 03/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 05/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 07/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 09/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 11/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 13/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 15/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 11/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Connected all rings cnode7-013:3148939:3149819 [1] NCCL INFO Channel 01/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 03/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 05/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 07/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 09/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 11/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 13/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 15/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 09/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 00/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 02/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 04/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 06/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Connected all rings cnode7-013:3148944:3149820 [6] NCCL INFO Channel 01/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 03/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 05/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 06/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 09/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 11/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 13/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 14/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 06/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 14/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 00/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 02/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 04/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 08/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 10/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3149820 [6] NCCL INFO Channel 12/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Connected all rings cnode7-013:3148945:3149895 [7] NCCL INFO Channel 07/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 15/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 01/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Connected all rings cnode7-012:3669416:3670093 [7] NCCL INFO Channel 07/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 15/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 02/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Connected all rings cnode7-013:3148942:3149828 [4] NCCL INFO Channel 01/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 03/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 04/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 07/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 09/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 11/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 12/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 15/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 04/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 12/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 00/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 02/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 06/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 08/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 10/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3149828 [4] NCCL INFO Channel 14/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Connected all rings cnode7-012:3669414:3670091 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 10/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 12/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 05/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 13/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 01/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 03/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 04/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 07/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 09/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 11/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 12/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3670091 [5] NCCL INFO Channel 15/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Connected all rings cnode7-012:3669401:3670095 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 10/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 13/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Connected all rings cnode7-012:3669402:3670094 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 02/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 06/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 10/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 14/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 11/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 13/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3670094 [3] NCCL INFO Channel 15/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Connected all rings cnode7-012:3669399:3670191 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 09/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 13/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3670191 [1] NCCL INFO Channel 15/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Connected all rings cnode7-012:3669409:3670090 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 08/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 10/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 12/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 14/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 04/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 12/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 01/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 03/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 05/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 07/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 09/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 11/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 13/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3670090 [4] NCCL INFO Channel 15/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Connected all rings cnode7-013:3148940:3149821 [2] NCCL INFO Channel 01/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 02/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 05/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 07/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 09/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 10/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 13/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 15/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 10/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 00/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 04/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 06/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 08/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 12/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3149821 [2] NCCL INFO Channel 14/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Connected all rings cnode7-012:3669415:3670092 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 04/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 08/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 10/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 12/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 14/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 06/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 14/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 01/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 03/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 05/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 07/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 09/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 11/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 13/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3670092 [6] NCCL INFO Channel 15/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Connected all rings cnode7-013:3148938:3149822 [0] NCCL INFO Channel 00/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 03/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 05/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 07/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 08/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 11/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 13/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 15/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 02/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 04/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 06/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 10/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 12/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 14/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3149822 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 08/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Connected all rings cnode7-013:3148943:3149823 [5] NCCL INFO Channel 01/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 03/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 05/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 07/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 09/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 11/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 13/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 15/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 05/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 13/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 00/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 02/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 04/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 06/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 08/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 10/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 12/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3149823 [5] NCCL INFO Channel 14/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-012:3669398:3670088 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 08/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 00/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 02/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 04/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 06/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 08/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 10/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 12/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3149829 [3] NCCL INFO Channel 14/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 08/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 10/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 12/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3149819 [1] NCCL INFO Channel 14/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 03/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 05/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 07/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 09/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 11/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 13/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 15/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 00/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 02/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 04/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 06/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 08/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 10/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 12/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3149895 [7] NCCL INFO Channel 14/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 04/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 06/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 10/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 12/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 14/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 01/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 03/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 05/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 06/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 09/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 11/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 13/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3670093 [7] NCCL INFO Channel 14/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669401:3670095 [2] NCCL INFO Connected all trees cnode7-013:3148940:3149821 [2] NCCL INFO Connected all trees cnode7-013:3148941:3149829 [3] NCCL INFO Connected all trees cnode7-012:3669402:3670094 [3] NCCL INFO Connected all trees cnode7-012:3669416:3670093 [7] NCCL INFO Connected all trees cnode7-012:3669399:3670191 [1] NCCL INFO Connected all trees cnode7-013:3148942:3149828 [4] NCCL INFO Connected all trees cnode7-013:3148943:3149823 [5] NCCL INFO Connected all trees cnode7-013:3148938:3149822 [0] NCCL INFO Connected all trees cnode7-013:3148945:3149895 [7] NCCL INFO Connected all trees cnode7-013:3148939:3149819 [1] NCCL INFO Connected all trees cnode7-013:3148944:3149820 [6] NCCL INFO Connected all trees cnode7-012:3669414:3670091 [5] NCCL INFO Connected all trees cnode7-012:3669409:3670090 [4] NCCL INFO Connected all trees cnode7-012:3669398:3670088 [0] NCCL INFO Connected all trees cnode7-012:3669415:3670092 [6] NCCL INFO Connected all trees cnode7-012:3669401:3670095 [2] NCCL INFO NVLS comm 0x55558161a910 headRank 2 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:3669399:3670191 [1] NCCL INFO NVLS comm 0x55556ca1a510 headRank 1 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:3669409:3670090 [4] NCCL INFO NVLS comm 0x55556ca1a380 headRank 4 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148940:3149821 [2] NCCL INFO NVLS comm 0x555560a24400 headRank 2 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148938:3149822 [0] NCCL INFO NVLS comm 0x555564791160 headRank 0 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148943:3149823 [5] NCCL INFO NVLS comm 0x55557b01b270 headRank 5 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:3669398:3670088 [0] NCCL INFO NVLS comm 0x55556fbdc870 headRank 0 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148941:3149829 [3] NCCL INFO NVLS comm 0x55558061b500 headRank 3 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148939:3149819 [1] NCCL INFO NVLS comm 0x55557261e6c0 headRank 1 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148944:3149820 [6] NCCL INFO NVLS comm 0x55556e41dbc0 headRank 6 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148945:3149895 [7] NCCL INFO NVLS comm 0x55558461c870 headRank 7 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:3669416:3670093 [7] NCCL INFO NVLS comm 0x555568c19990 headRank 7 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148942:3149828 [4] NCCL INFO NVLS comm 0x555581619730 headRank 4 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:3669414:3670091 [5] NCCL INFO NVLS comm 0x555581106840 headRank 5 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:3669402:3670094 [3] NCCL INFO NVLS comm 0x55558161a080 headRank 3 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148940:3149821 [2] NCCL INFO Channel 00/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 04/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 07/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 08/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 09/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO NVLS comm 0x555562a21340 headRank 6 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148938:3149822 [0] NCCL INFO Channel 02/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 03/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 05/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 06/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 07/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 10/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 11/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 12/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 13/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 14/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 15/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 01/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 02/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 03/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 05/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 06/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 07/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 09/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 10/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 11/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 12/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 13/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 14/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3149822 [0] NCCL INFO Channel 15/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 00/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 01/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 02/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 03/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 06/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 07/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 08/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 09/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 10/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 11/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 14/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 15/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 00/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 01/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 02/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 03/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 04/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 06/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 07/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 08/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 09/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 10/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 11/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 12/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 14/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3149823 [5] NCCL INFO Channel 15/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 01/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 02/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 03/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 05/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 06/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 07/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 09/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 00/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 01/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 04/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 05/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 06/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 08/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 09/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 12/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 13/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 14/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 15/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 00/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 01/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 02/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 04/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 05/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 06/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 08/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 09/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 10/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 12/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 13/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 14/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3149829 [3] NCCL INFO Channel 15/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 02/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 03/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 04/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 06/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 07/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 10/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 11/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 12/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 13/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 14/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 15/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 00/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 02/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 03/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 04/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 06/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 07/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 08/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 10/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 11/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 12/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 13/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 14/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3149819 [1] NCCL INFO Channel 15/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 00/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 01/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 02/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 03/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 04/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 05/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 08/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 09/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 10/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 11/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 12/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 13/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 00/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 01/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 02/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 03/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 04/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 05/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 07/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 08/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 09/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 10/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 11/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 12/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 13/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3149820 [6] NCCL INFO Channel 15/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 00/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 01/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 02/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 03/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 04/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 05/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 08/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 09/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 10/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 11/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 12/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 13/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 00/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 01/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 02/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 03/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 04/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 05/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 06/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 08/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 09/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 10/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 11/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 12/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 13/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3149895 [7] NCCL INFO Channel 14/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 00/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 01/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 02/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 03/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 04/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 05/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 06/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 08/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 00/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 01/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 02/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 03/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 06/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 07/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 08/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 09/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 10/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 11/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 14/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 15/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 00/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 01/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 02/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 03/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 05/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 06/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 07/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 08/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 09/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 10/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 11/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 13/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 14/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3149828 [4] NCCL INFO Channel 15/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 00/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 01/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 02/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 03/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 04/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 06/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 07/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 08/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 12/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 13/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 14/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 15/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 00/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 03/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 04/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 07/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 08/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 09/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 11/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 12/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 13/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 14/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3149821 [2] NCCL INFO Channel 15/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 00/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 03/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 04/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 07/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 08/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 00/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 01/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 02/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 04/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 05/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 06/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 08/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 00/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 02/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 03/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 04/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 06/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 07/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 08/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 10/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 11/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 12/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 13/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 14/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 15/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 02/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 03/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 04/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 06/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 07/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 10/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 11/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 12/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 13/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 14/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3670191 [1] NCCL INFO Channel 15/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 00/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 01/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 02/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 03/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 05/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 06/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 07/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 08/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 09/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 10/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 11/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 13/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 14/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 15/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 00/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 01/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 02/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 03/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 06/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 07/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 08/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 09/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 10/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 11/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 14/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3670090 [4] NCCL INFO Channel 15/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 00/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 01/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 02/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 03/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 04/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 05/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 07/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 08/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 09/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 10/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 11/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 12/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 13/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 15/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 00/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 01/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 02/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 03/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 04/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 05/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 08/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 09/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 10/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 11/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 12/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3670092 [6] NCCL INFO Channel 13/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 10/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 11/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 12/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 13/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 14/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 15/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 02/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 03/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 05/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 06/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 07/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 10/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 11/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 12/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 13/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 14/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3670088 [0] NCCL INFO Channel 15/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 09/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 10/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 11/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 12/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 13/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 14/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 00/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 01/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 02/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 03/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 04/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 05/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 08/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 09/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 10/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 11/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 12/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3670093 [7] NCCL INFO Channel 13/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 09/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 10/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 11/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 12/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 14/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 15/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 00/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 01/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 02/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 03/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 06/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 07/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 08/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 09/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 10/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 11/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 14/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3670091 [5] NCCL INFO Channel 15/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 09/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 11/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 12/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 13/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 14/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 15/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 00/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 04/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 07/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 08/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 09/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 12/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 13/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 14/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3670095 [2] NCCL INFO Channel 15/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 09/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 10/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 12/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 13/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 14/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 15/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 00/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 01/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 04/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 05/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 06/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 08/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 09/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 12/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 13/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 14/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Channel 15/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3670094 [3] NCCL INFO Connected NVLS tree cnode7-012:3669402:3670094 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669402:3670094 [3] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669398:3670088 [0] NCCL INFO Connected NVLS tree cnode7-012:3669398:3670088 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669398:3670088 [0] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669401:3670095 [2] NCCL INFO Connected NVLS tree cnode7-012:3669401:3670095 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669401:3670095 [2] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669399:3670191 [1] NCCL INFO Connected NVLS tree cnode7-012:3669399:3670191 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669399:3670191 [1] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669415:3670092 [6] NCCL INFO Connected NVLS tree cnode7-012:3669415:3670092 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669415:3670092 [6] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669409:3670090 [4] NCCL INFO Connected NVLS tree cnode7-012:3669409:3670090 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669409:3670090 [4] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3148940:3149821 [2] NCCL INFO Connected NVLS tree cnode7-013:3148940:3149821 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148940:3149821 [2] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3148938:3149822 [0] NCCL INFO Connected NVLS tree cnode7-013:3148938:3149822 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148938:3149822 [0] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669416:3670093 [7] NCCL INFO Connected NVLS tree cnode7-012:3669416:3670093 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669416:3670093 [7] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3148941:3149829 [3] NCCL INFO Connected NVLS tree cnode7-013:3148941:3149829 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148941:3149829 [3] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3148939:3149819 [1] NCCL INFO Connected NVLS tree cnode7-013:3148939:3149819 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148939:3149819 [1] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3148944:3149820 [6] NCCL INFO Connected NVLS tree cnode7-013:3148944:3149820 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148944:3149820 [6] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3148945:3149895 [7] NCCL INFO Connected NVLS tree cnode7-013:3148945:3149895 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148945:3149895 [7] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3148942:3149828 [4] NCCL INFO Connected NVLS tree cnode7-013:3148942:3149828 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148942:3149828 [4] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669414:3670091 [5] NCCL INFO Connected NVLS tree cnode7-012:3669414:3670091 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669414:3670091 [5] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669414:3670091 [5] NCCL INFO comm 0x555581106840 rank 5 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x984a2c3e4aa92e48 - Init COMPLETE cnode7-012:3669401:3670095 [2] NCCL INFO comm 0x55558161a910 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x984a2c3e4aa92e48 - Init COMPLETE cnode7-012:3669409:3670090 [4] NCCL INFO comm 0x55556ca1a380 rank 4 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x984a2c3e4aa92e48 - Init COMPLETE cnode7-012:3669399:3670191 [1] NCCL INFO comm 0x55556ca1a510 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x984a2c3e4aa92e48 - Init COMPLETE cnode7-012:3669416:3670093 [7] NCCL INFO comm 0x555568c19990 rank 7 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x984a2c3e4aa92e48 - Init COMPLETE cnode7-012:3669398:3670088 [0] NCCL INFO comm 0x55556fbdc870 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x984a2c3e4aa92e48 - Init COMPLETE cnode7-012:3669402:3670094 [3] NCCL INFO comm 0x55558161a080 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x984a2c3e4aa92e48 - Init COMPLETE cnode7-012:3669415:3670092 [6] NCCL INFO comm 0x555562a21340 rank 6 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x984a2c3e4aa92e48 - Init COMPLETE cnode7-013:3148943:3149823 [5] NCCL INFO Connected NVLS tree cnode7-013:3148943:3149823 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148943:3149823 [5] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3148945:3149895 [7] NCCL INFO comm 0x55558461c870 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x984a2c3e4aa92e48 - Init COMPLETE cnode7-013:3148938:3149822 [0] NCCL INFO comm 0x555564791160 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x984a2c3e4aa92e48 - Init COMPLETE cnode7-013:3148939:3149819 [1] NCCL INFO comm 0x55557261e6c0 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x984a2c3e4aa92e48 - Init COMPLETE cnode7-013:3148940:3149821 [2] NCCL INFO comm 0x555560a24400 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x984a2c3e4aa92e48 - Init COMPLETE cnode7-013:3148941:3149829 [3] NCCL INFO comm 0x55558061b500 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x984a2c3e4aa92e48 - Init COMPLETE cnode7-013:3148942:3149828 [4] NCCL INFO comm 0x555581619730 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x984a2c3e4aa92e48 - Init COMPLETE cnode7-013:3148944:3149820 [6] NCCL INFO comm 0x55556e41dbc0 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x984a2c3e4aa92e48 - Init COMPLETE cnode7-013:3148943:3149823 [5] NCCL INFO comm 0x55557b01b270 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x984a2c3e4aa92e48 - Init COMPLETE [2024-12-06 23:47:53,579] [INFO] [logging.py:129:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False [2024-12-06 23:47:53,589] [INFO] [logging.py:129:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adam as basic optimizer [2024-12-06 23:47:53,589] [INFO] [logging.py:129:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer [2024-12-06 23:47:53,625] [INFO] [logging.py:129:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam [2024-12-06 23:47:53,626] [INFO] [utils.py:59:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= [2024-12-06 23:47:53,626] [INFO] [logging.py:129:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 2 optimizer [2024-12-06 23:47:53,626] [INFO] [stage_1_and_2.py:149:__init__] Reduce bucket size 500000000 [2024-12-06 23:47:53,626] [INFO] [stage_1_and_2.py:150:__init__] Allgather bucket size 500000000 [2024-12-06 23:47:53,626] [INFO] [stage_1_and_2.py:151:__init__] CPU Offload: False [2024-12-06 23:47:53,626] [INFO] [stage_1_and_2.py:152:__init__] Round robin gradient partitioning: False [2024-12-06 23:47:50,392] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:50,392] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:50,392] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:50,392] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:50,392] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-06 23:47:50,408] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-06 23:47:55,225] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:55,225] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:55,225] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:55,225] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:55,225] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-06 23:47:55,238] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-06 23:47:55,249] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-06 23:47:50,533] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-06 23:47:55,375] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states [2024-12-06 23:47:55,378] [INFO] [utils.py:782:see_memory_usage] MA 1.37 GB Max_MA 1.45 GB CA 1.45 GB Max_CA 1 GB [2024-12-06 23:47:55,378] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 72.89 GB, percent = 3.6% [2024-12-06 23:47:50,693] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-06 23:47:55,552] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states [2024-12-06 23:47:55,553] [INFO] [utils.py:782:see_memory_usage] MA 1.37 GB Max_MA 1.52 GB CA 1.6 GB Max_CA 2 GB [2024-12-06 23:47:55,553] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 74.47 GB, percent = 3.7% [2024-12-06 23:47:55,553] [INFO] [stage_1_and_2.py:544:__init__] optimizer state initialized [2024-12-06 23:47:55,746] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer [2024-12-06 23:47:55,746] [INFO] [utils.py:782:see_memory_usage] MA 1.37 GB Max_MA 1.37 GB CA 1.6 GB Max_CA 2 GB [2024-12-06 23:47:55,746] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 76.41 GB, percent = 3.8% [2024-12-06 23:47:55,748] [INFO] [logging.py:129:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer [2024-12-06 23:47:55,748] [INFO] [logging.py:129:log_dist] [Rank 0] DeepSpeed using client callable to create LR scheduler [2024-12-06 23:47:55,748] [INFO] [logging.py:129:log_dist] [Rank 0] DeepSpeed LR Scheduler = PiecewiseLinearWarmupLR(warmup_steps_list=[0, 30000, 60000], warmup_lr_list=[0.0, 5e-05, 0.0002]) [2024-12-06 23:47:55,748] [INFO] [logging.py:129:log_dist] [Rank 0] step=0, skipped=0, lr=[np.float64(1.6666666666666667e-09)], mom=[[0.9, 0.98]] [2024-12-06 23:47:55,749] [INFO] [config.py:999:print] DeepSpeedEngine configuration: [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] activation_checkpointing_config { "partition_activations": false, "contiguous_memory_optimization": false, "cpu_checkpointing": false, "number_checkpoints": null, "synchronize_checkpoint_boundary": false, "profile": false } [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] amp_enabled .................. False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] amp_params ................... False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] autotuning_config ............ { "enabled": false, "start_step": null, "end_step": null, "metric_path": null, "arg_mappings": null, "metric": "throughput", "model_info": null, "results_dir": "autotuning_results", "exps_dir": "autotuning_exps", "overwrite": true, "fast": true, "start_profile_step": 3, "end_profile_step": 5, "tuner_type": "gridsearch", "tuner_early_stopping": 5, "tuner_num_trials": 50, "model_info_path": null, "mp_size": 1, "max_train_batch_size": null, "min_train_batch_size": 1, "max_train_micro_batch_size_per_gpu": 1.024000e+03, "min_train_micro_batch_size_per_gpu": 1, "num_tuning_micro_batch_sizes": 3 } [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] bfloat16_enabled ............. True [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] bfloat16_immediate_grad_update False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] checkpoint_parallel_write_pipeline False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] checkpoint_tag_validation_enabled True [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] checkpoint_tag_validation_fail False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] comms_config ................. [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] communication_data_type ...... None [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] curriculum_enabled_legacy .... False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] curriculum_params_legacy ..... False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] data_efficiency_enabled ...... False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] dataloader_drop_last ......... False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] disable_allgather ............ False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] dump_state ................... False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] dynamic_loss_scale_args ...... None [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] eigenvalue_enabled ........... False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] eigenvalue_gas_boundary_resolution 1 [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] eigenvalue_layer_name ........ bert.encoder.layer [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] eigenvalue_layer_num ......... 0 [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] eigenvalue_max_iter .......... 100 [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] eigenvalue_stability ......... 1e-06 [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] eigenvalue_tol ............... 0.01 [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] eigenvalue_verbose ........... False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] elasticity_enabled ........... False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] flops_profiler_config ........ { "enabled": false, "recompute_fwd_factor": 0.0, "profile_step": 1, "module_depth": -1, "top_modules": 1, "detailed": true, "output_file": null } [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] fp16_auto_cast ............... None [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] fp16_enabled ................. False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] fp16_master_weights_and_gradients False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] global_rank .................. 0 [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] grad_accum_dtype ............. None [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] gradient_accumulation_steps .. 1 [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] gradient_clipping ............ 5.0 [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] gradient_predivide_factor .... 1.0 [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] graph_harvesting ............. False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] initial_dynamic_scale ........ 1 [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] load_universal_checkpoint .... False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] loss_scale ................... 1.0 [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] memory_breakdown ............. False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] mics_hierarchial_params_gather False [2024-12-06 23:47:55,750] [INFO] [config.py:1003:print] mics_shard_size .............. -1 [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] nebula_config ................ { "enabled": false, "persistent_storage_path": null, "persistent_time_interval": 100, "num_of_version_in_retention": 2, "enable_nebula_load": true, "load_path": null } [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] optimizer_legacy_fusion ...... False [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] optimizer_name ............... adam [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] optimizer_params ............. {'lr': 0.0002, 'betas': [0.9, 0.98], 'eps': 1e-06, 'weight_decay': 0.0} [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] pld_enabled .................. False [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] pld_params ................... False [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] prescale_gradients ........... False [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] scheduler_name ............... None [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] scheduler_params ............. None [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] seq_parallel_communication_data_type torch.float32 [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] sparse_attention ............. None [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] sparse_gradients_enabled ..... False [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] steps_per_print .............. 1000 [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] timers_config ................ enabled=True synchronized=True [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] train_batch_size ............. 16 [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] train_micro_batch_size_per_gpu 1 [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] use_data_before_expert_parallel_ False [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] use_node_local_storage ....... False [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] wall_clock_breakdown ......... False [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] weight_quantization_config ... None [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] world_size ................... 16 [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] zero_allow_untested_optimizer False [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] zero_enabled ................. True [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] zero_force_ds_cpu_optimizer .. True [2024-12-06 23:47:55,751] [INFO] [config.py:1003:print] zero_optimization_stage ...... 2 [2024-12-06 23:47:55,751] [INFO] [config.py:989:print_user_config] json = { "train_micro_batch_size_per_gpu": 1, "gradient_accumulation_steps": 1, "gradient_clipping": 5.0, "bf16": { "enabled": true }, "zero_optimization": { "stage": 2, "contiguous_gradients": true, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 5.000000e+08, "allgather_bucket_size": 5.000000e+08 }, "optimizer": { "type": "Adam", "params": { "lr": 0.0002, "betas": [0.9, 0.98], "eps": 1e-06, "weight_decay": 0.0 } }, "wall_clock_breakdown": false, "steps_per_print": 1000 } [cnode7-012:0/16] 2024-12-06 23:47:55,753 (deepspeed_trainer:75) INFO: Resume training from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17 [2024-12-06 23:47:55,754] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-06 23:47:51,595] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:51,595] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:51,595] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:51,595] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:51,595] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:51,595] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:51,595] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:51,595] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:51,651] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:51,658] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:51,668] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:51,669] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:51,669] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:51,671] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:51,672] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:51,672] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:56,468] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:56,468] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:56,468] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:56,468] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:56,468] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:56,468] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:56,468] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:56,468] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:56,503] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:56,505] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:56,548] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:56,548] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:56,549] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:56,550] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:56,550] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:56,550] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt... [2024-12-06 23:47:52,363] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:52,363] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:52,363] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:52,364] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:52,364] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:52,365] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:52,365] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:52,366] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:57,164] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:57,164] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:57,214] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:57,214] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:57,214] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:57,214] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:57,214] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:57,215] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/mp_rank_00_model_states.pt. [2024-12-06 23:47:57,362] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-06 23:47:57,362] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-06 23:47:52,591] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-06 23:47:52,591] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-06 23:47:52,618] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-06 23:47:52,619] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-06 23:47:52,620] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-06 23:47:52,621] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-06 23:47:52,626] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-06 23:47:52,653] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-06 23:47:57,473] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-06 23:47:57,477] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-06 23:47:57,477] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-06 23:47:57,480] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-06 23:47:57,480] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-06 23:47:57,483] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-06 23:47:57,740] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-06 23:47:57,740] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 1 [2024-12-06 23:47:57,875] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-06 23:47:57,875] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 2 [2024-12-06 23:47:57,876] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 1 [2024-12-06 23:47:57,884] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-06 23:47:57,884] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 3 [2024-12-06 23:47:53,089] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-06 23:47:53,089] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 12 [2024-12-06 23:47:53,089] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-06 23:47:53,090] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 14 [2024-12-06 23:47:53,108] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-06 23:47:53,108] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 11 [2024-12-06 23:47:53,126] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-06 23:47:53,126] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 9 [2024-12-06 23:47:53,126] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-06 23:47:53,127] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 10 [2024-12-06 23:47:57,931] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 2 [2024-12-06 23:47:53,146] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-06 23:47:53,146] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 15 [2024-12-06 23:47:57,942] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 3 [2024-12-06 23:47:53,158] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-06 23:47:53,158] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 13 [2024-12-06 23:47:53,162] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-06 23:47:53,162] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 8 [2024-12-06 23:47:53,167] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 12 [2024-12-06 23:47:53,168] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 11 [2024-12-06 23:47:53,184] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 14 [2024-12-06 23:47:53,189] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 10 [2024-12-06 23:47:53,189] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 9 [2024-12-06 23:47:58,003] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-06 23:47:58,003] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 6 [2024-12-06 23:47:53,217] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 13 [2024-12-06 23:47:58,014] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-06 23:47:58,014] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 0 [2024-12-06 23:47:58,020] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-06 23:47:58,020] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 5 [2024-12-06 23:47:58,050] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-06 23:47:58,050] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 7 [2024-12-06 23:47:58,062] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 6 [2024-12-06 23:47:53,273] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 8 [2024-12-06 23:47:58,072] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 0 [2024-12-06 23:47:58,079] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 5 [2024-12-06 23:47:58,081] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_17/17/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-06 23:47:58,081] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 4 [2024-12-06 23:47:53,288] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 15 [cnode7-012:0/16] 2024-12-06 23:47:58,089 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [2024-12-06 23:47:58,105] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 7 [2024-12-06 23:47:58,140] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 4 [cnode7-012:0/16] 2024-12-06 23:48:24,080 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-06 23:48:40,832 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-06 23:48:40,833 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-06 23:48:40,835 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-06 23:49:02,982] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:48:58,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:03,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:03,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:48:59,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:03,925] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:04,223] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:48:59,453] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:48:59,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:04,729] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:04,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:00,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:00,313] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:00,987] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:06,551] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:02,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:48,502] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:49,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:45,235] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:45,867] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:46,224] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:46,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:47,152] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:51,995] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:52,084] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:52,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:52,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:48,263] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:53,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:49,283] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:49:51,205] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:00,350] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:33,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:35,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:31,668] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:33,273] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:34,053] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:39,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:40,028] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:40,865] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:41,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:36,662] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:41,835] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:37,331] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:37,419] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:38,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:40,283] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:50:45,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:17,687] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:20,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:18,889] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:19,962] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:21,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:24,541] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:24,942] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:24,046] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:26,886] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:25,727] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:29,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:30,273] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:28,910] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:29,261] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:32,899] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-06 23:51:37,129] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): cnode7-012:3669398:3678322 [0] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669398:3678322 [0] NCCL INFO Using network IB cnode7-013:3148939:3157594 [1] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148939:3157594 [1] NCCL INFO Using network IB cnode7-013:3148945:3157593 [7] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148945:3157593 [7] NCCL INFO Using network IB cnode7-013:3148942:3157592 [4] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148942:3157592 [4] NCCL INFO Using network IB cnode7-012:3669401:3678323 [2] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669401:3678323 [2] NCCL INFO Using network IB cnode7-012:3669399:3678324 [1] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669399:3678324 [1] NCCL INFO Using network IB cnode7-012:3669402:3678325 [3] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669402:3678325 [3] NCCL INFO Using network IB /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): cnode7-012:3669409:3678467 [4] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669409:3678467 [4] NCCL INFO Using network IB cnode7-013:3148941:3157741 [3] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148941:3157741 [3] NCCL INFO Using network IB /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): cnode7-013:3148938:3157884 [0] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148938:3157884 [0] NCCL INFO Using network IB /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): cnode7-012:3669415:3678630 [6] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669415:3678630 [6] NCCL INFO Using network IB cnode7-013:3148943:3158039 [5] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148943:3158039 [5] NCCL INFO Using network IB /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): cnode7-012:3669416:3678983 [7] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669416:3678983 [7] NCCL INFO Using network IB cnode7-012:3669414:3678986 [5] NCCL INFO Using non-device net plugin version 0 cnode7-012:3669414:3678986 [5] NCCL INFO Using network IB /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): cnode7-013:3148940:3158175 [2] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148940:3158175 [2] NCCL INFO Using network IB /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): cnode7-012:3669401:3678323 [2] NCCL INFO comm 0x5555c0f6fde0 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x78d5b8d03204da9b - Init START cnode7-012:3669414:3678986 [5] NCCL INFO comm 0x5555c0f69f40 rank 5 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x78d5b8d03204da9b - Init START cnode7-012:3669416:3678983 [7] NCCL INFO comm 0x5555c0f47b60 rank 7 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x78d5b8d03204da9b - Init START cnode7-012:3669402:3678325 [3] NCCL INFO comm 0x5555d4f73cb0 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x78d5b8d03204da9b - Init START cnode7-013:3148944:3158740 [6] NCCL INFO Using non-device net plugin version 0 cnode7-013:3148944:3158740 [6] NCCL INFO Using network IB cnode7-013:3148944:3158740 [6] NCCL INFO comm 0x5555c0f57bd0 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x78d5b8d03204da9b - Init START cnode7-012:3669409:3678467 [4] NCCL INFO comm 0x5555c0f60cb0 rank 4 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x78d5b8d03204da9b - Init START cnode7-012:3669415:3678630 [6] NCCL INFO comm 0x5555be7a8e70 rank 6 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x78d5b8d03204da9b - Init START cnode7-012:3669399:3678324 [1] NCCL INFO comm 0x5555c0f7f7f0 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x78d5b8d03204da9b - Init START cnode7-013:3148940:3158175 [2] NCCL INFO comm 0x5555c0f69b10 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x78d5b8d03204da9b - Init START cnode7-012:3669398:3678322 [0] NCCL INFO comm 0x5555b6f5ee30 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x78d5b8d03204da9b - Init START cnode7-013:3148938:3157884 [0] NCCL INFO comm 0x5555b6f5d8f0 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x78d5b8d03204da9b - Init START cnode7-013:3148939:3157594 [1] NCCL INFO comm 0x5555be7c1db0 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x78d5b8d03204da9b - Init START cnode7-013:3148943:3158039 [5] NCCL INFO comm 0x5555be7b53f0 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x78d5b8d03204da9b - Init START cnode7-013:3148941:3157741 [3] NCCL INFO comm 0x5555c0f78a30 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x78d5b8d03204da9b - Init START cnode7-013:3148945:3157593 [7] NCCL INFO comm 0x5555c0f64350 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x78d5b8d03204da9b - Init START cnode7-013:3148942:3157592 [4] NCCL INFO comm 0x5555be7b40e0 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x78d5b8d03204da9b - Init START cnode7-013:3148945:3157593 [7] NCCL INFO Setting affinity for GPU 7 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:3148945:3157593 [7] NCCL INFO NVLS multicast support is available on dev 7 cnode7-013:3148942:3157592 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:3148942:3157592 [4] NCCL INFO NVLS multicast support is available on dev 4 cnode7-013:3148940:3158175 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:3148940:3158175 [2] NCCL INFO NVLS multicast support is available on dev 2 cnode7-013:3148938:3157884 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:3148938:3157884 [0] NCCL INFO NVLS multicast support is available on dev 0 cnode7-013:3148939:3157594 [1] NCCL INFO Setting affinity for GPU 1 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:3148939:3157594 [1] NCCL INFO NVLS multicast support is available on dev 1 cnode7-013:3148943:3158039 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:3148943:3158039 [5] NCCL INFO NVLS multicast support is available on dev 5 cnode7-013:3148941:3157741 [3] NCCL INFO Setting affinity for GPU 3 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:3148941:3157741 [3] NCCL INFO NVLS multicast support is available on dev 3 cnode7-013:3148944:3158740 [6] NCCL INFO Setting affinity for GPU 6 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:3148944:3158740 [6] NCCL INFO NVLS multicast support is available on dev 6 cnode7-012:3669398:3678322 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:3669398:3678322 [0] NCCL INFO NVLS multicast support is available on dev 0 cnode7-012:3669401:3678323 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:3669401:3678323 [2] NCCL INFO NVLS multicast support is available on dev 2 cnode7-012:3669414:3678986 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:3669414:3678986 [5] NCCL INFO NVLS multicast support is available on dev 5 cnode7-012:3669416:3678983 [7] NCCL INFO Setting affinity for GPU 7 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:3669416:3678983 [7] NCCL INFO NVLS multicast support is available on dev 7 cnode7-012:3669402:3678325 [3] NCCL INFO Setting affinity for GPU 3 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:3669402:3678325 [3] NCCL INFO NVLS multicast support is available on dev 3 cnode7-013:3148944:3158740 [6] NCCL INFO comm 0x5555c0f57bd0 rank 14 nRanks 16 nNodes 2 localRanks 8 localRank 6 MNNVL 0 cnode7-013:3148944:3158740 [6] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 15/-1/-1->14->13 [3] 15/-1/-1->14->13 [4] 15/-1/-1->14->13 [5] 15/-1/-1->14->13 [6] 15/-1/-1->14->6 [7] -1/-1/-1->14->13 [8] 15/-1/-1->14->13 [9] 15/-1/-1->14->13 [10] 15/-1/-1->14->13 [11] 15/-1/-1->14->13 [12] 15/-1/-1->14->13 [13] 15/-1/-1->14->13 [14] 15/6/-1->14->-1 [15] -1/-1/-1->14->13 cnode7-013:3148944:3158740 [6] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669402:3678325 [3] NCCL INFO comm 0x5555d4f73cb0 rank 3 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0 cnode7-012:3669402:3678325 [3] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669402:3678325 [3] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669402:3678325 [3] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669402:3678325 [3] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669402:3678325 [3] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669402:3678325 [3] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669402:3678325 [3] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669402:3678325 [3] NCCL INFO NVLS Head 7: 7 15 cnode7-012:3669402:3678325 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/11/-1->3->-1 [4] -1/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->11 [12] -1/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 cnode7-012:3669402:3678325 [3] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669399:3678324 [1] NCCL INFO Setting affinity for GPU 1 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:3669399:3678324 [1] NCCL INFO NVLS multicast support is available on dev 1 cnode7-012:3669399:3678324 [1] NCCL INFO comm 0x5555c0f7f7f0 rank 1 nRanks 16 nNodes 2 localRanks 8 localRank 1 MNNVL 0 cnode7-012:3669399:3678324 [1] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669399:3678324 [1] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669399:3678324 [1] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669399:3678324 [1] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669399:3678324 [1] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669399:3678324 [1] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669399:3678324 [1] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669399:3678324 [1] NCCL INFO NVLS Head 7: 7 15 cnode7-012:3669399:3678324 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/9/-1->1->-1 [2] -1/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->9 [10] -1/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 cnode7-012:3669399:3678324 [1] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669398:3678322 [0] NCCL INFO comm 0x5555b6f5ee30 rank 0 nRanks 16 nNodes 2 localRanks 8 localRank 0 MNNVL 0 cnode7-012:3669398:3678322 [0] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669398:3678322 [0] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669398:3678322 [0] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669398:3678322 [0] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669398:3678322 [0] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669398:3678322 [0] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669398:3678322 [0] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669398:3678322 [0] NCCL INFO NVLS Head 7: 7 15 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 00/16 : 0 7 6 5 4 3 2 1 9 10 11 12 13 14 15 8 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 01/16 : 0 8 15 14 13 12 11 10 9 1 2 3 4 5 6 7 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 02/16 : 0 7 6 5 4 3 11 12 13 14 15 8 9 10 2 1 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 03/16 : 0 1 2 10 9 8 15 14 13 12 11 3 4 5 6 7 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 04/16 : 0 7 6 5 13 14 15 8 9 10 11 12 4 3 2 1 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 05/16 : 0 1 2 3 4 12 11 10 9 8 15 14 13 5 6 7 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 06/16 : 0 7 15 8 9 10 11 12 13 14 6 5 4 3 2 1 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 07/16 : 0 1 2 3 4 5 6 14 13 12 11 10 9 8 15 7 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 08/16 : 0 7 6 5 4 3 2 1 9 10 11 12 13 14 15 8 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 09/16 : 0 8 15 14 13 12 11 10 9 1 2 3 4 5 6 7 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 10/16 : 0 7 6 5 4 3 11 12 13 14 15 8 9 10 2 1 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 11/16 : 0 1 2 10 9 8 15 14 13 12 11 3 4 5 6 7 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 12/16 : 0 7 6 5 13 14 15 8 9 10 11 12 4 3 2 1 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 13/16 : 0 1 2 3 4 12 11 10 9 8 15 14 13 5 6 7 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 14/16 : 0 7 15 8 9 10 11 12 13 14 6 5 4 3 2 1 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 15/16 : 0 1 2 3 4 5 6 14 13 12 11 10 9 8 15 7 cnode7-012:3669398:3678322 [0] NCCL INFO Trees [0] 1/8/-1->0->-1 [1] -1/-1/-1->0->7 [2] 1/-1/-1->0->7 [3] 1/-1/-1->0->7 [4] 1/-1/-1->0->7 [5] 1/-1/-1->0->7 [6] 1/-1/-1->0->7 [7] 1/-1/-1->0->7 [8] 1/-1/-1->0->8 [9] -1/-1/-1->0->7 [10] 1/-1/-1->0->7 [11] 1/-1/-1->0->7 [12] 1/-1/-1->0->7 [13] 1/-1/-1->0->7 [14] 1/-1/-1->0->7 [15] 1/-1/-1->0->7 cnode7-012:3669398:3678322 [0] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669401:3678323 [2] NCCL INFO comm 0x5555c0f6fde0 rank 2 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0 cnode7-012:3669401:3678323 [2] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669401:3678323 [2] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669401:3678323 [2] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669401:3678323 [2] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669401:3678323 [2] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669401:3678323 [2] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669401:3678323 [2] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669401:3678323 [2] NCCL INFO NVLS Head 7: 7 15 cnode7-012:3669401:3678323 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/10/-1->2->-1 [3] -1/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->10 [11] -1/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 cnode7-012:3669401:3678323 [2] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669409:3678467 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:3669409:3678467 [4] NCCL INFO NVLS multicast support is available on dev 4 cnode7-012:3669409:3678467 [4] NCCL INFO comm 0x5555c0f60cb0 rank 4 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0 cnode7-012:3669409:3678467 [4] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669409:3678467 [4] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669409:3678467 [4] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669409:3678467 [4] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669409:3678467 [4] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669409:3678467 [4] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669409:3678467 [4] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669409:3678467 [4] NCCL INFO NVLS Head 7: 7 15 cnode7-012:3669409:3678467 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/12/-1->4->-1 [5] -1/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->12 [13] -1/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 cnode7-012:3669409:3678467 [4] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669416:3678983 [7] NCCL INFO comm 0x5555c0f47b60 rank 7 nRanks 16 nNodes 2 localRanks 8 localRank 7 MNNVL 0 cnode7-012:3669416:3678983 [7] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669416:3678983 [7] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669416:3678983 [7] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669416:3678983 [7] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669416:3678983 [7] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669416:3678983 [7] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669416:3678983 [7] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669416:3678983 [7] NCCL INFO NVLS Head 7: 7 15 cnode7-012:3669416:3678983 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] 0/-1/-1->7->6 [2] 0/-1/-1->7->6 [3] 0/-1/-1->7->6 [4] 0/-1/-1->7->6 [5] 0/-1/-1->7->6 [6] 0/-1/-1->7->6 [7] 0/15/-1->7->-1 [8] -1/-1/-1->7->6 [9] 0/-1/-1->7->6 [10] 0/-1/-1->7->6 [11] 0/-1/-1->7->6 [12] 0/-1/-1->7->6 [13] 0/-1/-1->7->6 [14] 0/-1/-1->7->6 [15] 0/-1/-1->7->15 cnode7-012:3669416:3678983 [7] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669415:3678630 [6] NCCL INFO Setting affinity for GPU 6 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:3669415:3678630 [6] NCCL INFO NVLS multicast support is available on dev 6 cnode7-012:3669415:3678630 [6] NCCL INFO comm 0x5555be7a8e70 rank 6 nRanks 16 nNodes 2 localRanks 8 localRank 6 MNNVL 0 cnode7-012:3669415:3678630 [6] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669415:3678630 [6] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669415:3678630 [6] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669415:3678630 [6] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669415:3678630 [6] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669415:3678630 [6] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669415:3678630 [6] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669415:3678630 [6] NCCL INFO NVLS Head 7: 7 15 cnode7-012:3669415:3678630 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/14/-1->6->-1 [7] -1/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->14 [15] -1/-1/-1->6->5 cnode7-012:3669415:3678630 [6] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669414:3678986 [5] NCCL INFO comm 0x5555c0f69f40 rank 5 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0 cnode7-012:3669414:3678986 [5] NCCL INFO NVLS Head 0: 0 8 cnode7-012:3669414:3678986 [5] NCCL INFO NVLS Head 1: 1 9 cnode7-012:3669414:3678986 [5] NCCL INFO NVLS Head 2: 2 10 cnode7-012:3669414:3678986 [5] NCCL INFO NVLS Head 3: 3 11 cnode7-012:3669414:3678986 [5] NCCL INFO NVLS Head 4: 4 12 cnode7-012:3669414:3678986 [5] NCCL INFO NVLS Head 5: 5 13 cnode7-012:3669414:3678986 [5] NCCL INFO NVLS Head 6: 6 14 cnode7-012:3669414:3678986 [5] NCCL INFO NVLS Head 7: 7 15 cnode7-012:3669414:3678986 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/13/-1->5->-1 [6] -1/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->13 [14] -1/-1/-1->5->4 [15] 6/-1/-1->5->4 cnode7-012:3669414:3678986 [5] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3148940:3158175 [2] NCCL INFO comm 0x5555c0f69b10 rank 10 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0 cnode7-013:3148940:3158175 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 [2] 11/-1/-1->10->2 [3] -1/-1/-1->10->9 [4] 11/-1/-1->10->9 [5] 11/-1/-1->10->9 [6] 11/-1/-1->10->9 [7] 11/-1/-1->10->9 [8] 11/-1/-1->10->9 [9] 11/-1/-1->10->9 [10] 11/2/-1->10->-1 [11] -1/-1/-1->10->9 [12] 11/-1/-1->10->9 [13] 11/-1/-1->10->9 [14] 11/-1/-1->10->9 [15] 11/-1/-1->10->9 cnode7-013:3148940:3158175 [2] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3148938:3157884 [0] NCCL INFO comm 0x5555b6f5d8f0 rank 8 nRanks 16 nNodes 2 localRanks 8 localRank 0 MNNVL 0 cnode7-013:3148938:3157884 [0] NCCL INFO Trees [0] 9/-1/-1->8->0 [1] -1/-1/-1->8->15 [2] 9/-1/-1->8->15 [3] 9/-1/-1->8->15 [4] 9/-1/-1->8->15 [5] 9/-1/-1->8->15 [6] 9/-1/-1->8->15 [7] 9/-1/-1->8->15 [8] 9/0/-1->8->-1 [9] -1/-1/-1->8->15 [10] 9/-1/-1->8->15 [11] 9/-1/-1->8->15 [12] 9/-1/-1->8->15 [13] 9/-1/-1->8->15 [14] 9/-1/-1->8->15 [15] 9/-1/-1->8->15 cnode7-013:3148938:3157884 [0] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3148941:3157741 [3] NCCL INFO comm 0x5555c0f78a30 rank 11 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0 cnode7-013:3148941:3157741 [3] NCCL INFO Trees [0] 12/-1/-1->11->10 [1] 12/-1/-1->11->10 [2] 12/-1/-1->11->10 [3] 12/-1/-1->11->3 [4] -1/-1/-1->11->10 [5] 12/-1/-1->11->10 [6] 12/-1/-1->11->10 [7] 12/-1/-1->11->10 [8] 12/-1/-1->11->10 [9] 12/-1/-1->11->10 [10] 12/-1/-1->11->10 [11] 12/3/-1->11->-1 [12] -1/-1/-1->11->10 [13] 12/-1/-1->11->10 [14] 12/-1/-1->11->10 [15] 12/-1/-1->11->10 cnode7-013:3148941:3157741 [3] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3148945:3157593 [7] NCCL INFO comm 0x5555c0f64350 rank 15 nRanks 16 nNodes 2 localRanks 8 localRank 7 MNNVL 0 cnode7-013:3148945:3157593 [7] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 8/-1/-1->15->14 [2] 8/-1/-1->15->14 [3] 8/-1/-1->15->14 [4] 8/-1/-1->15->14 [5] 8/-1/-1->15->14 [6] 8/-1/-1->15->14 [7] 8/-1/-1->15->7 [8] -1/-1/-1->15->14 [9] 8/-1/-1->15->14 [10] 8/-1/-1->15->14 [11] 8/-1/-1->15->14 [12] 8/-1/-1->15->14 [13] 8/-1/-1->15->14 [14] 8/-1/-1->15->14 [15] 8/7/-1->15->-1 cnode7-013:3148945:3157593 [7] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3148943:3158039 [5] NCCL INFO comm 0x5555be7b53f0 rank 13 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0 cnode7-013:3148943:3158039 [5] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->12 [2] 14/-1/-1->13->12 [3] 14/-1/-1->13->12 [4] 14/-1/-1->13->12 [5] 14/-1/-1->13->5 [6] -1/-1/-1->13->12 [7] 14/-1/-1->13->12 [8] 14/-1/-1->13->12 [9] 14/-1/-1->13->12 [10] 14/-1/-1->13->12 [11] 14/-1/-1->13->12 [12] 14/-1/-1->13->12 [13] 14/5/-1->13->-1 [14] -1/-1/-1->13->12 [15] 14/-1/-1->13->12 cnode7-013:3148943:3158039 [5] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3148939:3157594 [1] NCCL INFO comm 0x5555be7c1db0 rank 9 nRanks 16 nNodes 2 localRanks 8 localRank 1 MNNVL 0 cnode7-013:3148939:3157594 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] 10/-1/-1->9->1 [2] -1/-1/-1->9->8 [3] 10/-1/-1->9->8 [4] 10/-1/-1->9->8 [5] 10/-1/-1->9->8 [6] 10/-1/-1->9->8 [7] 10/-1/-1->9->8 [8] 10/-1/-1->9->8 [9] 10/1/-1->9->-1 [10] -1/-1/-1->9->8 [11] 10/-1/-1->9->8 [12] 10/-1/-1->9->8 [13] 10/-1/-1->9->8 [14] 10/-1/-1->9->8 [15] 10/-1/-1->9->8 cnode7-013:3148939:3157594 [1] NCCL INFO P2P Chunksize set to 131072 cnode7-013:3148942:3157592 [4] NCCL INFO comm 0x5555be7b40e0 rank 12 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0 cnode7-013:3148942:3157592 [4] NCCL INFO Trees [0] 13/-1/-1->12->11 [1] 13/-1/-1->12->11 [2] 13/-1/-1->12->11 [3] 13/-1/-1->12->11 [4] 13/-1/-1->12->4 [5] -1/-1/-1->12->11 [6] 13/-1/-1->12->11 [7] 13/-1/-1->12->11 [8] 13/-1/-1->12->11 [9] 13/-1/-1->12->11 [10] 13/-1/-1->12->11 [11] 13/-1/-1->12->11 [12] 13/4/-1->12->-1 [13] -1/-1/-1->12->11 [14] 13/-1/-1->12->11 [15] 13/-1/-1->12->11 cnode7-013:3148942:3157592 [4] NCCL INFO P2P Chunksize set to 131072 cnode7-012:3669398:3678322 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 00/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 02/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 00/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 02/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 04/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 08/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 10/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 12/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 11/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 06/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 14/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 15/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 00/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 02/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 04/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 06/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 08/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 10/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 12/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 14/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 09/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 00/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 08/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 12/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 00/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 04/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 06/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 08/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 12/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 14/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 01/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 03/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 05/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 07/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 09/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 11/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 13/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 15/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 04/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 06/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 08/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 10/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 12/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 14/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 08/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 01/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 09/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 02/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 04/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 06/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 10/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 12/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 14/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 01/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 03/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 05/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 07/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 09/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 11/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 13/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 15/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 01/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 09/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 08/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 00/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 02/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 04/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 06/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 08/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 10/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 12/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 14/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 00/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 08/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 09/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 03/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 05/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 07/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 11/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 13/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 15/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 00/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 02/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 04/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 06/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 08/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 10/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 12/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 14/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 01/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 03/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 07/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 09/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 11/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 15/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 00/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 02/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 04/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 06/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 08/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 10/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 12/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 14/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 01/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 05/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 07/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 09/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 13/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 15/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 06/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 14/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 07/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 15/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 00/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 02/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 04/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 06/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 08/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 10/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 12/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 14/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 01/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 03/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 05/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 09/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 11/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 13/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 00/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 02/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 06/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 08/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 10/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 14/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 01/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 03/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 05/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 07/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 09/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 11/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 13/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 15/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 00/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 02/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 06/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 08/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 10/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 14/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 07/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 15/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 06/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 14/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 09/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 11/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 15/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 00/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 02/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 04/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 08/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 10/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 12/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 14/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 15/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 06/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 14/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 01/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 03/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 05/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 07/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 09/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 11/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 13/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 15/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 00/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 02/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 04/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 06/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 08/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 10/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 12/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 14/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Connected all rings cnode7-012:3669414:3678986 [5] NCCL INFO Connected all rings cnode7-012:3669416:3678983 [7] NCCL INFO Connected all rings cnode7-012:3669402:3678325 [3] NCCL INFO Connected all rings cnode7-013:3148944:3158740 [6] NCCL INFO Connected all rings cnode7-012:3669409:3678467 [4] NCCL INFO Connected all rings cnode7-012:3669409:3678467 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 08/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 10/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 12/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 14/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 04/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 12/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 01/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 03/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 05/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 07/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 09/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 11/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 13/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669409:3678467 [4] NCCL INFO Channel 15/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Connected all rings cnode7-012:3669415:3678630 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 04/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 08/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 10/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 12/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 14/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 06/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 14/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 01/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 03/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 05/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 07/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 09/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 11/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 13/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669415:3678630 [6] NCCL INFO Channel 15/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Connected all rings cnode7-012:3669399:3678324 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 09/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 13/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:3669399:3678324 [1] NCCL INFO Channel 15/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Connected all rings cnode7-013:3148940:3158175 [2] NCCL INFO Channel 01/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 02/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 05/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 07/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 09/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 10/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 13/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 15/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 10/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 00/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 04/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 06/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 08/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 12/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:3148940:3158175 [2] NCCL INFO Channel 14/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Connected all rings cnode7-012:3669398:3678322 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 01/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 03/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 05/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 07/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 09/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 11/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 13/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 15/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:3669398:3678322 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 08/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Connected all rings cnode7-013:3148938:3157884 [0] NCCL INFO Channel 00/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 03/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 05/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 07/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 08/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 11/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 13/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 15/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 02/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 04/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 06/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 10/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 12/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 14/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:3148938:3157884 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 08/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Connected all rings cnode7-013:3148939:3157594 [1] NCCL INFO Channel 01/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 03/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 05/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 07/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 09/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 11/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 13/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 15/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 09/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 00/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 02/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 04/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 06/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 08/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 10/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 12/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148939:3157594 [1] NCCL INFO Channel 14/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Connected all rings cnode7-013:3148943:3158039 [5] NCCL INFO Channel 01/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 03/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 05/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 07/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 09/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 11/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 13/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 15/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 05/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 13/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 00/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 02/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 04/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 06/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 08/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 10/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 12/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148943:3158039 [5] NCCL INFO Channel 14/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Connected all rings cnode7-013:3148941:3157741 [3] NCCL INFO Channel 01/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 03/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 05/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 07/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 09/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 11/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 13/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 15/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 11/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 00/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 02/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 04/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 06/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 08/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 10/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 12/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Channel 14/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Connected all rings cnode7-013:3148945:3157593 [7] NCCL INFO Channel 07/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 15/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 01/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 03/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 05/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 07/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 09/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 11/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 13/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 15/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 00/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 02/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 04/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 06/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 08/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 10/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 12/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148945:3157593 [7] NCCL INFO Channel 14/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Connected all rings cnode7-013:3148942:3157592 [4] NCCL INFO Channel 01/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 03/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 04/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 07/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 09/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 11/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 12/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 15/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 04/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 12/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 00/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 02/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 06/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 08/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 10/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:3148942:3157592 [4] NCCL INFO Channel 14/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 10/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 13/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669401:3678323 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 10/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 12/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 05/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 13/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 01/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 03/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 04/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 07/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 09/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 11/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 12/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669414:3678986 [5] NCCL INFO Channel 15/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 07/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 15/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 02/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 04/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 06/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 10/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 12/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 14/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 01/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 03/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 05/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 06/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 09/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 11/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 13/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669416:3678983 [7] NCCL INFO Channel 14/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 02/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 06/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 10/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 14/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 11/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 13/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:3669402:3678325 [3] NCCL INFO Channel 15/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 01/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 03/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 05/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 06/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 09/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 11/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 13/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 14/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 06/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 14/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 00/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 02/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 04/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 08/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 10/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148944:3158740 [6] NCCL INFO Channel 12/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:3148941:3157741 [3] NCCL INFO Connected all trees cnode7-012:3669402:3678325 [3] NCCL INFO Connected all trees cnode7-012:3669401:3678323 [2] NCCL INFO Connected all trees cnode7-013:3148940:3158175 [2] NCCL INFO Connected all trees cnode7-012:3669414:3678986 [5] NCCL INFO Connected all trees cnode7-013:3148944:3158740 [6] NCCL INFO Connected all trees cnode7-013:3148939:3157594 [1] NCCL INFO Connected all trees cnode7-013:3148945:3157593 [7] NCCL INFO Connected all trees cnode7-013:3148943:3158039 [5] NCCL INFO Connected all trees cnode7-012:3669398:3678322 [0] NCCL INFO Connected all trees cnode7-012:3669415:3678630 [6] NCCL INFO Connected all trees cnode7-012:3669416:3678983 [7] NCCL INFO Connected all trees cnode7-012:3669409:3678467 [4] NCCL INFO Connected all trees cnode7-013:3148938:3157884 [0] NCCL INFO Connected all trees cnode7-013:3148942:3157592 [4] NCCL INFO Connected all trees cnode7-013:3148941:3157741 [3] NCCL INFO NVLS comm 0x5555c0f78a30 headRank 3 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:3669399:3678324 [1] NCCL INFO Connected all trees cnode7-013:3148945:3157593 [7] NCCL INFO NVLS comm 0x5555c0f64350 headRank 7 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148942:3157592 [4] NCCL INFO NVLS comm 0x5555be7b40e0 headRank 4 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:3669401:3678323 [2] NCCL INFO NVLS comm 0x5555c0f6fde0 headRank 2 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:3669414:3678986 [5] NCCL INFO NVLS comm 0x5555c0f69f40 headRank 5 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:3669416:3678983 [7] NCCL INFO NVLS comm 0x5555c0f47b60 headRank 7 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:3669402:3678325 [3] NCCL INFO NVLS comm 0x5555d4f73cb0 headRank 3 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148944:3158740 [6] NCCL INFO NVLS comm 0x5555c0f57bd0 headRank 6 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:3669409:3678467 [4] NCCL INFO NVLS comm 0x5555c0f60cb0 headRank 4 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:3669415:3678630 [6] NCCL INFO NVLS comm 0x5555be7a8e70 headRank 6 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:3669399:3678324 [1] NCCL INFO NVLS comm 0x5555c0f7f7f0 headRank 1 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148940:3158175 [2] NCCL INFO NVLS comm 0x5555c0f69b10 headRank 2 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148940:3158175 [2] NCCL INFO Channel 00/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 04/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 07/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 08/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 09/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 12/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 13/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 14/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 15/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 00/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 03/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 04/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 07/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 08/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 09/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 11/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 12/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 13/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 14/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:3148940:3158175 [2] NCCL INFO Channel 15/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO NVLS comm 0x5555b6f5ee30 headRank 0 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148938:3157884 [0] NCCL INFO NVLS comm 0x5555b6f5d8f0 headRank 0 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148938:3157884 [0] NCCL INFO Channel 02/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 03/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 05/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 06/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 07/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 10/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 11/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 12/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 13/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 14/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 15/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 01/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 02/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 03/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 05/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 06/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 07/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 09/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 10/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 11/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 12/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 13/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 14/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148938:3157884 [0] NCCL INFO Channel 15/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO NVLS comm 0x5555be7c1db0 headRank 1 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148939:3157594 [1] NCCL INFO Channel 02/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 03/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 04/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 06/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 07/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 10/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 11/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 12/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 13/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 14/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 15/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 00/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 02/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 03/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 04/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 06/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 07/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 08/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 10/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 11/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 12/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 13/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 14/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148939:3157594 [1] NCCL INFO Channel 15/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO NVLS comm 0x5555be7b53f0 headRank 5 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:3148943:3158039 [5] NCCL INFO Channel 00/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 01/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 02/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 03/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 06/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 07/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 08/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 09/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 10/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 11/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 14/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 15/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 00/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 01/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 02/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 03/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 04/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 06/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 07/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 08/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 09/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 10/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 11/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 12/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 14/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148943:3158039 [5] NCCL INFO Channel 15/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 00/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 01/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 04/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 05/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 06/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 08/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 09/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 12/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 13/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 14/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 15/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 00/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 01/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 02/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 04/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 05/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 06/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 08/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 09/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 10/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 12/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 13/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 14/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148941:3157741 [3] NCCL INFO Channel 15/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 00/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 01/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 02/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 03/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 04/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 05/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 08/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 09/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 10/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 11/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 12/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 13/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 00/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 01/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 02/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 03/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 04/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 05/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 06/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 08/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 09/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 10/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 11/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 12/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 13/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148945:3157593 [7] NCCL INFO Channel 14/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 00/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 01/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 02/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 03/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 06/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 07/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 08/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 09/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 10/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 11/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 14/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 15/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 00/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 01/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 02/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 03/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 05/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 06/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 07/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 08/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 09/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 10/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 11/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 13/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 14/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:3148942:3157592 [4] NCCL INFO Channel 15/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 00/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 03/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 04/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 07/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 08/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 09/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 11/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 12/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 13/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 14/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 15/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 00/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 04/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 07/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 08/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 09/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 12/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 13/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 14/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Channel 15/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 00/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 01/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 02/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 03/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 04/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 06/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 07/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 08/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 09/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 10/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 11/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 12/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 14/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 15/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 00/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 01/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 02/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 03/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 06/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 07/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 08/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 09/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 10/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 11/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 14/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669414:3678986 [5] NCCL INFO Channel 15/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 00/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 01/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 02/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 03/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 04/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 05/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 06/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 08/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 09/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 10/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 11/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 12/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 13/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 14/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 00/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 01/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 02/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 03/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 04/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 05/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 08/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 09/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 10/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 11/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 12/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669416:3678983 [7] NCCL INFO Channel 13/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 00/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 01/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 02/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 04/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 05/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 06/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 08/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 09/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 10/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 12/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 13/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 14/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 15/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 00/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 01/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 04/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 05/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 06/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 08/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 09/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 12/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 13/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 14/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:3669402:3678325 [3] NCCL INFO Channel 15/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 00/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 01/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 02/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 03/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 04/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 05/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 08/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 09/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 10/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 11/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 12/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 13/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 00/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 01/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 02/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 03/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 04/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 05/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 07/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 08/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 09/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 10/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 11/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 12/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 13/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:3148944:3158740 [6] NCCL INFO Channel 15/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 00/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 01/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 02/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 03/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 05/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 06/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 07/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 08/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 09/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 10/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 11/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 13/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 14/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 15/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 00/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 01/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 02/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 03/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 06/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 07/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 08/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 09/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 10/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 11/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 14/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669409:3678467 [4] NCCL INFO Channel 15/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 00/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 01/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 02/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 03/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 04/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 05/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 07/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 08/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 09/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 10/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 11/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 12/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 13/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 15/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 00/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 01/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 02/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 03/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 04/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 05/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 08/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 09/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 10/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 11/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 12/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669415:3678630 [6] NCCL INFO Channel 13/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 00/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 02/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 03/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 04/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 06/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 07/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 08/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 10/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 11/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 12/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 13/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 14/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 15/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 02/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 03/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 04/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 06/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 07/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 10/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 11/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 12/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 13/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 14/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669399:3678324 [1] NCCL INFO Channel 15/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 01/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 02/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 03/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 05/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 06/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 07/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 09/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 10/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 11/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 12/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 13/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 14/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 15/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 02/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 03/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 05/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 06/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 07/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 10/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 11/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 12/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 13/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 14/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669398:3678322 [0] NCCL INFO Channel 15/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:3669401:3678323 [2] NCCL INFO Connected NVLS tree cnode7-012:3669401:3678323 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669401:3678323 [2] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3148940:3158175 [2] NCCL INFO Connected NVLS tree cnode7-013:3148940:3158175 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148940:3158175 [2] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669398:3678322 [0] NCCL INFO Connected NVLS tree cnode7-012:3669398:3678322 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669398:3678322 [0] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3148938:3157884 [0] NCCL INFO Connected NVLS tree cnode7-013:3148938:3157884 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148938:3157884 [0] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3148939:3157594 [1] NCCL INFO Connected NVLS tree cnode7-013:3148939:3157594 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148939:3157594 [1] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3148943:3158039 [5] NCCL INFO Connected NVLS tree cnode7-013:3148943:3158039 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148943:3158039 [5] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3148941:3157741 [3] NCCL INFO Connected NVLS tree cnode7-013:3148941:3157741 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148941:3157741 [3] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669414:3678986 [5] NCCL INFO Connected NVLS tree cnode7-012:3669414:3678986 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669414:3678986 [5] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669402:3678325 [3] NCCL INFO Connected NVLS tree cnode7-012:3669402:3678325 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669402:3678325 [3] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669399:3678324 [1] NCCL INFO Connected NVLS tree cnode7-012:3669399:3678324 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669399:3678324 [1] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669416:3678983 [7] NCCL INFO Connected NVLS tree cnode7-012:3669416:3678983 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669416:3678983 [7] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669409:3678467 [4] NCCL INFO Connected NVLS tree cnode7-012:3669409:3678467 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669409:3678467 [4] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3148945:3157593 [7] NCCL INFO Connected NVLS tree cnode7-013:3148945:3157593 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148945:3157593 [7] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:3148942:3157592 [4] NCCL INFO Connected NVLS tree cnode7-013:3148942:3157592 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148942:3157592 [4] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669415:3678630 [6] NCCL INFO Connected NVLS tree cnode7-012:3669415:3678630 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:3669415:3678630 [6] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669401:3678323 [2] NCCL INFO comm 0x5555c0f6fde0 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x78d5b8d03204da9b - Init COMPLETE cnode7-012:3669414:3678986 [5] NCCL INFO comm 0x5555c0f69f40 rank 5 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x78d5b8d03204da9b - Init COMPLETE cnode7-012:3669416:3678983 [7] NCCL INFO comm 0x5555c0f47b60 rank 7 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x78d5b8d03204da9b - Init COMPLETE cnode7-012:3669402:3678325 [3] NCCL INFO comm 0x5555d4f73cb0 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x78d5b8d03204da9b - Init COMPLETE cnode7-013:3148944:3158740 [6] NCCL INFO Connected NVLS tree cnode7-013:3148944:3158740 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:3148944:3158740 [6] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:3669409:3678467 [4] NCCL INFO comm 0x5555c0f60cb0 rank 4 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x78d5b8d03204da9b - Init COMPLETE cnode7-012:3669415:3678630 [6] NCCL INFO comm 0x5555be7a8e70 rank 6 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x78d5b8d03204da9b - Init COMPLETE cnode7-012:3669399:3678324 [1] NCCL INFO comm 0x5555c0f7f7f0 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x78d5b8d03204da9b - Init COMPLETE cnode7-012:3669398:3678322 [0] NCCL INFO comm 0x5555b6f5ee30 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x78d5b8d03204da9b - Init COMPLETE cnode7-013:3148938:3157884 [0] NCCL INFO comm 0x5555b6f5d8f0 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x78d5b8d03204da9b - Init COMPLETE cnode7-013:3148939:3157594 [1] NCCL INFO comm 0x5555be7c1db0 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x78d5b8d03204da9b - Init COMPLETE cnode7-013:3148943:3158039 [5] NCCL INFO comm 0x5555be7b53f0 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x78d5b8d03204da9b - Init COMPLETE cnode7-013:3148941:3157741 [3] NCCL INFO comm 0x5555c0f78a30 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x78d5b8d03204da9b - Init COMPLETE cnode7-013:3148942:3157592 [4] NCCL INFO comm 0x5555be7b40e0 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x78d5b8d03204da9b - Init COMPLETE cnode7-013:3148944:3158740 [6] NCCL INFO comm 0x5555c0f57bd0 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x78d5b8d03204da9b - Init COMPLETE cnode7-013:3148940:3158175 [2] NCCL INFO comm 0x5555c0f69b10 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x78d5b8d03204da9b - Init COMPLETE cnode7-013:3148945:3157593 [7] NCCL INFO comm 0x5555c0f64350 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x78d5b8d03204da9b - Init COMPLETE [cnode7-012:0/16] 2024-12-06 23:52:51,711 (deepspeed_trainer:228) INFO: 18epoch:train:1-100batch: iter_time=2.355, loss_ctc=78.160, loss_att=54.791, acc=0.699, loss=61.770, grad_norm=4.293, loss_scale=1.000, learning_rate=9.700e-05, step_time=0.610 [cnode7-012:0/16] 2024-12-06 23:53:28,911 (deepspeed_trainer:228) INFO: 18epoch:train:101-200batch: iter_time=1.239e-04, loss_ctc=67.691, loss_att=50.629, acc=0.701, loss=55.755, grad_norm=4.286, loss_scale=1.000, learning_rate=9.699e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-06 23:54:06,110 (deepspeed_trainer:228) INFO: 18epoch:train:201-300batch: iter_time=1.227e-04, loss_ctc=91.796, loss_att=59.674, acc=0.699, loss=69.330, grad_norm=5.627, loss_scale=1.000, learning_rate=9.697e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-06 23:54:43,241 (deepspeed_trainer:228) INFO: 18epoch:train:301-400batch: iter_time=1.225e-04, loss_ctc=97.493, loss_att=63.454, acc=0.678, loss=73.674, grad_norm=8.733, loss_scale=1.000, learning_rate=9.695e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-06 23:55:20,431 (deepspeed_trainer:228) INFO: 18epoch:train:401-500batch: iter_time=1.223e-04, loss_ctc=76.089, loss_att=58.533, acc=0.697, loss=63.788, grad_norm=4.625, loss_scale=1.000, learning_rate=9.693e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-06 23:55:57,592 (deepspeed_trainer:228) INFO: 18epoch:train:501-600batch: iter_time=1.279e-04, loss_ctc=91.797, loss_att=64.756, acc=0.688, loss=72.877, grad_norm=5.525, loss_scale=1.000, learning_rate=9.691e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-06 23:56:34,451 (deepspeed_trainer:228) INFO: 18epoch:train:601-700batch: iter_time=1.342e-04, loss_ctc=82.094, loss_att=58.568, acc=0.697, loss=65.632, grad_norm=5.098, loss_scale=1.000, learning_rate=9.689e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-06 23:57:11,980 (deepspeed_trainer:228) INFO: 18epoch:train:701-800batch: iter_time=1.461e-04, loss_ctc=77.086, loss_att=63.283, acc=0.674, loss=67.424, grad_norm=4.452, loss_scale=1.000, learning_rate=9.687e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-06 23:57:48,975 (deepspeed_trainer:228) INFO: 18epoch:train:801-900batch: iter_time=1.562e-04, loss_ctc=68.453, loss_att=50.476, acc=0.685, loss=55.861, grad_norm=4.279, loss_scale=1.000, learning_rate=9.685e-05, step_time=0.370 [2024-12-06 23:58:25,435] [INFO] [logging.py:129:log_dist] [Rank 0] step=256000, skipped=0, lr=[np.float64(9.682439454522451e-05)], mom=[[0.9, 0.98]] [2024-12-06 23:58:25,435] [INFO] [timer.py:264:stop] epoch=0/micro_step=1000/global_step=1000, RunningAvgSamplesPerSec=44.16316122204052, CurrSamplesPerSec=43.62400389748318, MemAllocated=1.77GB, MaxMemAllocated=15.25GB [cnode7-012:0/16] 2024-12-06 23:58:25,437 (deepspeed_trainer:228) INFO: 18epoch:train:901-1000batch: iter_time=1.351e-04, loss_ctc=68.254, loss_att=48.783, acc=0.679, loss=54.628, grad_norm=4.504, loss_scale=1.000, learning_rate=9.683e-05, step_time=0.364 [cnode7-012:0/16] 2024-12-06 23:59:02,276 (deepspeed_trainer:228) INFO: 18epoch:train:1001-1100batch: iter_time=1.306e-04, loss_ctc=86.221, loss_att=55.954, acc=0.687, loss=65.010, grad_norm=5.159, loss_scale=1.000, learning_rate=9.681e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-06 23:59:39,333 (deepspeed_trainer:228) INFO: 18epoch:train:1101-1200batch: iter_time=1.249e-04, loss_ctc=71.517, loss_att=60.617, acc=0.683, loss=63.889, grad_norm=4.388, loss_scale=1.000, learning_rate=9.680e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 00:00:16,776 (deepspeed_trainer:228) INFO: 18epoch:train:1201-1300batch: iter_time=1.298e-04, loss_ctc=81.622, loss_att=53.365, acc=0.709, loss=61.864, grad_norm=4.331, loss_scale=1.000, learning_rate=9.678e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 00:00:53,518 (deepspeed_trainer:228) INFO: 18epoch:train:1301-1400batch: iter_time=1.304e-04, loss_ctc=71.151, loss_att=56.526, acc=0.680, loss=60.917, grad_norm=4.278, loss_scale=1.000, learning_rate=9.676e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 00:01:30,525 (deepspeed_trainer:228) INFO: 18epoch:train:1401-1500batch: iter_time=1.231e-04, loss_ctc=84.170, loss_att=61.862, acc=0.689, loss=68.549, grad_norm=4.346, loss_scale=1.000, learning_rate=9.674e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 00:02:07,527 (deepspeed_trainer:228) INFO: 18epoch:train:1501-1600batch: iter_time=1.264e-04, loss_ctc=83.272, loss_att=58.079, acc=0.696, loss=65.619, grad_norm=4.761, loss_scale=1.000, learning_rate=9.672e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 00:02:44,390 (deepspeed_trainer:228) INFO: 18epoch:train:1601-1700batch: iter_time=1.286e-04, loss_ctc=74.106, loss_att=55.015, acc=0.684, loss=60.734, grad_norm=4.707, loss_scale=1.000, learning_rate=9.670e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 00:03:21,310 (deepspeed_trainer:228) INFO: 18epoch:train:1701-1800batch: iter_time=1.303e-04, loss_ctc=78.953, loss_att=60.398, acc=0.688, loss=65.964, grad_norm=4.700, loss_scale=1.000, learning_rate=9.668e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 00:03:52,861 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-07 00:04:19,140 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 00:04:35,172 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 00:04:35,172 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 00:04:35,175 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 00:05:00,897] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:04:59,161] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:01,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:02,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:04:59,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:04:59,929] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:02,230] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:00,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:02,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:00,432] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:02,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:00,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:03,403] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:06,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:10,246] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:10,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:47,740] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:50,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:48,005] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:50,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:48,397] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:50,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:51,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:48,730] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:49,043] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:49,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:52,411] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:52,986] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:51,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:54,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:57,279] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:05:57,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:38,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:36,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:36,963] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:37,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:37,604] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:39,936] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:37,786] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:37,869] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:40,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:41,343] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:39,312] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:42,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:41,258] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:44,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:45,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:06:45,853] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:26,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:24,245] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:27,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:25,014] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:25,251] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:27,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:28,216] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:26,354] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:26,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:27,020] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:27,127] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:30,379] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:28,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:31,982] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:32,458] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:07:36,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 00:08:11,887 (deepspeed_trainer:228) INFO: 18epoch:train:1801-1900batch: iter_time=2.496, loss_ctc=68.709, loss_att=47.306, acc=0.702, loss=53.719, grad_norm=4.079, loss_scale=1.000, learning_rate=9.666e-05, step_time=0.409 [2024-12-07 00:08:50,230] [INFO] [logging.py:129:log_dist] [Rank 0] step=257000, skipped=0, lr=[np.float64(9.663583737030714e-05)], mom=[[0.9, 0.98]] [2024-12-07 00:08:50,231] [INFO] [timer.py:264:stop] epoch=0/micro_step=2000/global_step=2000, RunningAvgSamplesPerSec=43.94247926021959, CurrSamplesPerSec=44.59522404645903, MemAllocated=1.77GB, MaxMemAllocated=15.25GB [cnode7-012:0/16] 2024-12-07 00:08:50,233 (deepspeed_trainer:228) INFO: 18epoch:train:1901-2000batch: iter_time=1.657e-04, loss_ctc=73.408, loss_att=53.511, acc=0.704, loss=59.484, grad_norm=4.188, loss_scale=1.000, learning_rate=9.665e-05, step_time=0.383 [cnode7-012:0/16] 2024-12-07 00:09:28,654 (deepspeed_trainer:228) INFO: 18epoch:train:2001-2100batch: iter_time=1.600e-04, loss_ctc=68.923, loss_att=54.584, acc=0.705, loss=58.865, grad_norm=4.013, loss_scale=1.000, learning_rate=9.663e-05, step_time=0.384 [cnode7-012:0/16] 2024-12-07 00:10:06,028 (deepspeed_trainer:228) INFO: 18epoch:train:2101-2200batch: iter_time=1.229e-04, loss_ctc=96.610, loss_att=62.445, acc=0.698, loss=72.689, grad_norm=7.113, loss_scale=1.000, learning_rate=9.661e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 00:10:43,275 (deepspeed_trainer:228) INFO: 18epoch:train:2201-2300batch: iter_time=1.050e-04, loss_ctc=73.676, loss_att=62.285, acc=0.689, loss=65.706, grad_norm=4.594, loss_scale=1.000, learning_rate=9.659e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 00:11:20,615 (deepspeed_trainer:228) INFO: 18epoch:train:2301-2400batch: iter_time=1.073e-04, loss_ctc=78.012, loss_att=59.430, acc=0.704, loss=65.011, grad_norm=4.673, loss_scale=1.000, learning_rate=9.657e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 00:11:57,562 (deepspeed_trainer:228) INFO: 18epoch:train:2401-2500batch: iter_time=1.096e-04, loss_ctc=95.953, loss_att=69.110, acc=0.687, loss=77.173, grad_norm=5.696, loss_scale=1.000, learning_rate=9.655e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 00:12:34,281 (deepspeed_trainer:228) INFO: 18epoch:train:2501-2600batch: iter_time=1.099e-04, loss_ctc=77.642, loss_att=58.664, acc=0.698, loss=64.305, grad_norm=4.862, loss_scale=1.000, learning_rate=9.653e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 00:13:10,880 (deepspeed_trainer:228) INFO: 18epoch:train:2601-2700batch: iter_time=1.105e-04, loss_ctc=70.903, loss_att=58.880, acc=0.685, loss=62.491, grad_norm=4.579, loss_scale=1.000, learning_rate=9.651e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 00:13:47,003 (deepspeed_trainer:228) INFO: 18epoch:train:2701-2800batch: iter_time=1.095e-04, loss_ctc=70.794, loss_att=52.643, acc=0.690, loss=58.082, grad_norm=4.836, loss_scale=1.000, learning_rate=9.650e-05, step_time=0.361 [cnode7-012:0/16] 2024-12-07 00:14:23,284 (deepspeed_trainer:228) INFO: 18epoch:train:2801-2900batch: iter_time=1.067e-04, loss_ctc=66.467, loss_att=46.675, acc=0.685, loss=52.617, grad_norm=4.326, loss_scale=1.000, learning_rate=9.648e-05, step_time=0.362 [2024-12-07 00:14:59,941] [INFO] [logging.py:129:log_dist] [Rank 0] step=258000, skipped=0, lr=[np.float64(9.644837751880323e-05)], mom=[[0.9, 0.98]] [2024-12-07 00:14:59,942] [INFO] [timer.py:264:stop] epoch=0/micro_step=3000/global_step=3000, RunningAvgSamplesPerSec=44.10368463137007, CurrSamplesPerSec=46.81109569856297, MemAllocated=1.77GB, MaxMemAllocated=15.26GB [cnode7-012:0/16] 2024-12-07 00:14:59,944 (deepspeed_trainer:228) INFO: 18epoch:train:2901-3000batch: iter_time=1.106e-04, loss_ctc=79.004, loss_att=53.480, acc=0.698, loss=61.119, grad_norm=5.134, loss_scale=1.000, learning_rate=9.646e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 00:15:36,998 (deepspeed_trainer:228) INFO: 18epoch:train:3001-3100batch: iter_time=1.109e-04, loss_ctc=75.449, loss_att=64.317, acc=0.691, loss=67.664, grad_norm=4.187, loss_scale=1.000, learning_rate=9.644e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 00:16:14,265 (deepspeed_trainer:228) INFO: 18epoch:train:3101-3200batch: iter_time=1.141e-04, loss_ctc=85.131, loss_att=56.580, acc=0.702, loss=65.108, grad_norm=4.742, loss_scale=1.000, learning_rate=9.642e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 00:16:51,079 (deepspeed_trainer:228) INFO: 18epoch:train:3201-3300batch: iter_time=1.130e-04, loss_ctc=66.024, loss_att=52.546, acc=0.701, loss=56.580, grad_norm=4.299, loss_scale=1.000, learning_rate=9.640e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 00:17:28,332 (deepspeed_trainer:228) INFO: 18epoch:train:3301-3400batch: iter_time=1.136e-04, loss_ctc=82.468, loss_att=63.794, acc=0.694, loss=69.398, grad_norm=4.701, loss_scale=1.000, learning_rate=9.638e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 00:18:05,276 (deepspeed_trainer:228) INFO: 18epoch:train:3401-3500batch: iter_time=1.135e-04, loss_ctc=81.101, loss_att=55.995, acc=0.705, loss=63.508, grad_norm=4.524, loss_scale=1.000, learning_rate=9.636e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 00:18:42,531 (deepspeed_trainer:228) INFO: 18epoch:train:3501-3600batch: iter_time=1.097e-04, loss_ctc=75.413, loss_att=57.948, acc=0.696, loss=63.208, grad_norm=4.754, loss_scale=1.000, learning_rate=9.635e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 00:19:19,697 (deepspeed_trainer:228) INFO: 18epoch:train:3601-3700batch: iter_time=1.081e-04, loss_ctc=76.149, loss_att=56.138, acc=0.695, loss=62.112, grad_norm=4.775, loss_scale=1.000, learning_rate=9.633e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 00:19:41,342 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-07 00:20:07,535 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 00:20:22,869 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 00:20:22,869 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 00:20:22,871 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 00:20:49,324] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:20:49,745] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:20:50,203] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:20:50,570] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:20:48,454] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:20:50,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:20:48,695] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:20:51,585] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:20:49,348] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:20:49,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:20:52,080] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:20:52,113] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:20:50,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:20:50,526] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:20:50,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:20:50,567] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:37,583] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:37,865] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:36,486] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:36,885] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:39,694] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:39,743] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:37,387] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:37,473] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:37,805] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:40,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:40,603] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:38,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:40,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:40,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:38,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:21:39,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:24,562] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:25,871] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:23,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:24,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:24,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:27,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:24,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:27,654] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:25,695] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:25,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:26,481] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:28,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:29,213] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:26,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:29,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:22:30,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:11,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:10,513] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:13,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:11,896] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:12,350] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:12,668] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:12,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:13,234] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:16,253] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:16,305] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:14,095] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:16,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:14,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:17,382] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:17,796] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:23:19,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 00:24:04,464 (deepspeed_trainer:228) INFO: 18epoch:train:3701-3800batch: iter_time=2.478, loss_ctc=68.123, loss_att=46.945, acc=0.709, loss=53.288, grad_norm=4.265, loss_scale=1.000, learning_rate=9.631e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 00:24:41,456 (deepspeed_trainer:228) INFO: 18epoch:train:3801-3900batch: iter_time=1.094e-04, loss_ctc=74.032, loss_att=53.900, acc=0.702, loss=59.911, grad_norm=4.791, loss_scale=1.000, learning_rate=9.629e-05, step_time=0.369 [2024-12-07 00:25:18,492] [INFO] [logging.py:129:log_dist] [Rank 0] step=259000, skipped=0, lr=[np.float64(9.626200438857197e-05)], mom=[[0.9, 0.98]] [2024-12-07 00:25:18,493] [INFO] [timer.py:264:stop] epoch=0/micro_step=4000/global_step=4000, RunningAvgSamplesPerSec=44.142191497076375, CurrSamplesPerSec=45.301672158879136, MemAllocated=1.77GB, MaxMemAllocated=15.26GB [cnode7-012:0/16] 2024-12-07 00:25:18,494 (deepspeed_trainer:228) INFO: 18epoch:train:3901-4000batch: iter_time=1.090e-04, loss_ctc=65.778, loss_att=50.639, acc=0.704, loss=55.193, grad_norm=4.400, loss_scale=1.000, learning_rate=9.627e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 00:25:55,649 (deepspeed_trainer:228) INFO: 18epoch:train:4001-4100batch: iter_time=1.134e-04, loss_ctc=97.241, loss_att=65.543, acc=0.687, loss=75.060, grad_norm=6.591, loss_scale=1.000, learning_rate=9.625e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 00:26:32,266 (deepspeed_trainer:228) INFO: 18epoch:train:4101-4200batch: iter_time=1.126e-04, loss_ctc=77.334, loss_att=60.883, acc=0.687, loss=65.814, grad_norm=5.014, loss_scale=1.000, learning_rate=9.623e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 00:27:09,296 (deepspeed_trainer:228) INFO: 18epoch:train:4201-4300batch: iter_time=1.100e-04, loss_ctc=91.653, loss_att=66.006, acc=0.700, loss=73.691, grad_norm=5.798, loss_scale=1.000, learning_rate=9.622e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 00:27:46,091 (deepspeed_trainer:228) INFO: 18epoch:train:4301-4400batch: iter_time=1.115e-04, loss_ctc=76.241, loss_att=58.173, acc=0.694, loss=63.591, grad_norm=4.428, loss_scale=1.000, learning_rate=9.620e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 00:28:22,693 (deepspeed_trainer:228) INFO: 18epoch:train:4401-4500batch: iter_time=1.115e-04, loss_ctc=75.275, loss_att=56.631, acc=0.689, loss=62.239, grad_norm=4.551, loss_scale=1.000, learning_rate=9.618e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 00:28:59,228 (deepspeed_trainer:228) INFO: 18epoch:train:4501-4600batch: iter_time=1.106e-04, loss_ctc=70.400, loss_att=58.904, acc=0.689, loss=62.344, grad_norm=4.346, loss_scale=1.000, learning_rate=9.616e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 00:29:35,305 (deepspeed_trainer:228) INFO: 18epoch:train:4601-4700batch: iter_time=1.088e-04, loss_ctc=70.451, loss_att=51.048, acc=0.683, loss=56.874, grad_norm=4.842, loss_scale=1.000, learning_rate=9.614e-05, step_time=0.360 [cnode7-012:0/16] 2024-12-07 00:30:11,499 (deepspeed_trainer:228) INFO: 18epoch:train:4701-4800batch: iter_time=1.086e-04, loss_ctc=67.395, loss_att=47.223, acc=0.685, loss=53.241, grad_norm=4.527, loss_scale=1.000, learning_rate=9.612e-05, step_time=0.362 [cnode7-012:0/16] 2024-12-07 00:30:48,123 (deepspeed_trainer:228) INFO: 18epoch:train:4801-4900batch: iter_time=1.091e-04, loss_ctc=82.346, loss_att=58.216, acc=0.695, loss=65.451, grad_norm=4.879, loss_scale=1.000, learning_rate=9.610e-05, step_time=0.366 [2024-12-07 00:31:24,987] [INFO] [logging.py:129:log_dist] [Rank 0] step=260000, skipped=0, lr=[np.float64(9.607670752033088e-05)], mom=[[0.9, 0.98]] [2024-12-07 00:31:24,988] [INFO] [timer.py:264:stop] epoch=0/micro_step=5000/global_step=5000, RunningAvgSamplesPerSec=44.27342266407035, CurrSamplesPerSec=44.18370054839173, MemAllocated=1.77GB, MaxMemAllocated=15.26GB [cnode7-012:0/16] 2024-12-07 00:31:24,989 (deepspeed_trainer:228) INFO: 18epoch:train:4901-5000batch: iter_time=1.085e-04, loss_ctc=76.148, loss_att=60.083, acc=0.691, loss=64.882, grad_norm=4.353, loss_scale=1.000, learning_rate=9.609e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 00:32:01,915 (deepspeed_trainer:228) INFO: 18epoch:train:5001-5100batch: iter_time=1.135e-04, loss_ctc=78.194, loss_att=52.400, acc=0.706, loss=60.142, grad_norm=4.280, loss_scale=1.000, learning_rate=9.607e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 00:32:39,141 (deepspeed_trainer:228) INFO: 18epoch:train:5101-5200batch: iter_time=1.090e-04, loss_ctc=71.984, loss_att=57.987, acc=0.694, loss=62.168, grad_norm=3.972, loss_scale=1.000, learning_rate=9.605e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 00:33:16,042 (deepspeed_trainer:228) INFO: 18epoch:train:5201-5300batch: iter_time=1.110e-04, loss_ctc=85.719, loss_att=65.095, acc=0.675, loss=71.231, grad_norm=5.036, loss_scale=1.000, learning_rate=9.603e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 00:33:52,533 (deepspeed_trainer:228) INFO: 18epoch:train:5301-5400batch: iter_time=1.103e-04, loss_ctc=71.611, loss_att=51.276, acc=0.704, loss=57.388, grad_norm=4.450, loss_scale=1.000, learning_rate=9.601e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 00:34:29,265 (deepspeed_trainer:228) INFO: 18epoch:train:5401-5500batch: iter_time=1.099e-04, loss_ctc=75.102, loss_att=56.170, acc=0.691, loss=61.857, grad_norm=4.496, loss_scale=1.000, learning_rate=9.599e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 00:35:05,814 (deepspeed_trainer:228) INFO: 18epoch:train:5501-5600batch: iter_time=1.064e-04, loss_ctc=74.814, loss_att=54.436, acc=0.695, loss=60.549, grad_norm=4.931, loss_scale=1.000, learning_rate=9.598e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 00:35:18,269 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-07 00:35:44,953 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 00:36:01,800 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 00:36:01,800 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 00:36:01,802 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 00:36:25,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:36:26,256] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:36:26,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:36:26,785] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:36:27,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:36:25,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:36:28,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:36:28,223] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:36:25,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:36:29,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:36:27,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:36:27,327] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:36:27,397] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:36:27,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:36:27,551] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:36:27,611] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:13,115] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:14,500] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:14,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:15,012] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:15,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:16,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:16,888] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:14,717] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:14,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:17,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:15,459] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:15,800] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:15,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:16,582] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:16,635] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:37:16,687] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:00,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:01,641] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:01,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:04,089] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:04,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:05,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:05,279] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:03,138] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:03,515] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:03,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:04,158] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:04,270] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:06,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:04,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:05,164] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:08,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:46,262] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:49,029] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:49,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:51,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:52,996] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:51,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:53,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:54,278] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:51,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:52,237] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:53,410] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:53,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:57,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:55,091] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:56,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:38:57,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 00:39:54,286 (deepspeed_trainer:228) INFO: 18epoch:train:5601-5700batch: iter_time=2.454, loss_ctc=68.475, loss_att=47.172, acc=0.706, loss=53.542, grad_norm=4.507, loss_scale=1.000, learning_rate=9.596e-05, step_time=0.430 [cnode7-012:0/16] 2024-12-07 00:40:31,305 (deepspeed_trainer:228) INFO: 18epoch:train:5701-5800batch: iter_time=1.071e-04, loss_ctc=75.146, loss_att=57.504, acc=0.702, loss=62.785, grad_norm=4.786, loss_scale=1.000, learning_rate=9.594e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 00:41:08,192 (deepspeed_trainer:228) INFO: 18epoch:train:5801-5900batch: iter_time=1.124e-04, loss_ctc=83.839, loss_att=57.046, acc=0.704, loss=65.103, grad_norm=5.874, loss_scale=1.000, learning_rate=9.592e-05, step_time=0.369 [2024-12-07 00:41:45,117] [INFO] [logging.py:129:log_dist] [Rank 0] step=261000, skipped=0, lr=[np.float64(9.589247659519035e-05)], mom=[[0.9, 0.98]] [2024-12-07 00:41:45,118] [INFO] [timer.py:264:stop] epoch=0/micro_step=6000/global_step=6000, RunningAvgSamplesPerSec=44.19580528765462, CurrSamplesPerSec=42.66544072793417, MemAllocated=1.77GB, MaxMemAllocated=15.26GB [cnode7-012:0/16] 2024-12-07 00:41:45,119 (deepspeed_trainer:228) INFO: 18epoch:train:5901-6000batch: iter_time=1.185e-04, loss_ctc=81.395, loss_att=59.614, acc=0.688, loss=66.181, grad_norm=6.328, loss_scale=1.000, learning_rate=9.590e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 00:42:22,073 (deepspeed_trainer:228) INFO: 18epoch:train:6001-6100batch: iter_time=1.131e-04, loss_ctc=75.608, loss_att=59.199, acc=0.695, loss=64.133, grad_norm=4.481, loss_scale=1.000, learning_rate=9.588e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 00:42:59,412 (deepspeed_trainer:228) INFO: 18epoch:train:6101-6200batch: iter_time=1.134e-04, loss_ctc=91.414, loss_att=63.549, acc=0.699, loss=71.953, grad_norm=5.488, loss_scale=1.000, learning_rate=9.586e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 00:43:36,240 (deepspeed_trainer:228) INFO: 18epoch:train:6201-6300batch: iter_time=1.089e-04, loss_ctc=74.766, loss_att=58.119, acc=0.697, loss=63.074, grad_norm=4.772, loss_scale=1.000, learning_rate=9.585e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 00:44:12,774 (deepspeed_trainer:228) INFO: 18epoch:train:6301-6400batch: iter_time=1.127e-04, loss_ctc=75.136, loss_att=57.885, acc=0.688, loss=63.041, grad_norm=4.764, loss_scale=1.000, learning_rate=9.583e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 00:44:49,145 (deepspeed_trainer:228) INFO: 18epoch:train:6401-6500batch: iter_time=1.084e-04, loss_ctc=70.222, loss_att=56.125, acc=0.694, loss=60.344, grad_norm=4.655, loss_scale=1.000, learning_rate=9.581e-05, step_time=0.363 [cnode7-012:0/16] 2024-12-07 00:45:25,174 (deepspeed_trainer:228) INFO: 18epoch:train:6501-6600batch: iter_time=1.081e-04, loss_ctc=68.784, loss_att=50.184, acc=0.676, loss=55.762, grad_norm=4.858, loss_scale=1.000, learning_rate=9.579e-05, step_time=0.360 [cnode7-012:0/16] 2024-12-07 00:46:01,615 (deepspeed_trainer:228) INFO: 18epoch:train:6601-6700batch: iter_time=1.076e-04, loss_ctc=72.179, loss_att=47.278, acc=0.698, loss=54.740, grad_norm=4.684, loss_scale=1.000, learning_rate=9.577e-05, step_time=0.364 [cnode7-012:0/16] 2024-12-07 00:46:38,343 (deepspeed_trainer:228) INFO: 18epoch:train:6701-6800batch: iter_time=1.085e-04, loss_ctc=78.488, loss_att=64.166, acc=0.685, loss=68.480, grad_norm=4.899, loss_scale=1.000, learning_rate=9.575e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 00:47:15,104 (deepspeed_trainer:228) INFO: 18epoch:train:6801-6900batch: iter_time=1.095e-04, loss_ctc=80.980, loss_att=54.883, acc=0.701, loss=62.729, grad_norm=4.382, loss_scale=1.000, learning_rate=9.574e-05, step_time=0.367 [2024-12-07 00:47:51,682] [INFO] [logging.py:129:log_dist] [Rank 0] step=262000, skipped=0, lr=[np.float64(9.570930143224013e-05)], mom=[[0.9, 0.98]] [2024-12-07 00:47:51,683] [INFO] [timer.py:264:stop] epoch=0/micro_step=7000/global_step=7000, RunningAvgSamplesPerSec=44.28058194378211, CurrSamplesPerSec=46.15953480012171, MemAllocated=1.77GB, MaxMemAllocated=15.26GB [cnode7-012:0/16] 2024-12-07 00:47:51,685 (deepspeed_trainer:228) INFO: 18epoch:train:6901-7000batch: iter_time=1.117e-04, loss_ctc=73.382, loss_att=53.931, acc=0.699, loss=59.786, grad_norm=4.408, loss_scale=1.000, learning_rate=9.572e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 00:48:28,367 (deepspeed_trainer:228) INFO: 18epoch:train:7001-7100batch: iter_time=1.096e-04, loss_ctc=75.579, loss_att=59.000, acc=0.697, loss=63.971, grad_norm=4.432, loss_scale=1.000, learning_rate=9.570e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 00:49:05,067 (deepspeed_trainer:228) INFO: 18epoch:train:7101-7200batch: iter_time=1.083e-04, loss_ctc=83.932, loss_att=59.925, acc=0.696, loss=67.143, grad_norm=5.172, loss_scale=1.000, learning_rate=9.568e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 00:49:41,427 (deepspeed_trainer:228) INFO: 18epoch:train:7201-7300batch: iter_time=1.098e-04, loss_ctc=68.441, loss_att=50.153, acc=0.700, loss=55.648, grad_norm=3.992, loss_scale=1.000, learning_rate=9.566e-05, step_time=0.363 [cnode7-012:0/16] 2024-12-07 00:50:18,630 (deepspeed_trainer:228) INFO: 18epoch:train:7301-7400batch: iter_time=1.086e-04, loss_ctc=76.862, loss_att=57.419, acc=0.691, loss=63.216, grad_norm=4.978, loss_scale=1.000, learning_rate=9.565e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 00:50:55,508 (deepspeed_trainer:228) INFO: 18epoch:train:7401-7500batch: iter_time=1.056e-04, loss_ctc=69.994, loss_att=51.489, acc=0.698, loss=57.050, grad_norm=4.506, loss_scale=1.000, learning_rate=9.563e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 00:50:59,229 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-07 00:51:24,978 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 00:51:41,415 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 00:51:41,415 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 00:51:41,417 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 00:52:06,632] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:07,665] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:07,893] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:05,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:05,513] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:05,743] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:05,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:08,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:09,037] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:06,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:06,582] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:09,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:07,250] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:07,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:10,271] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:11,080] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:53,413] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:55,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:55,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:53,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:54,324] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:56,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:54,765] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:57,454] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:54,942] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:55,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:57,736] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:58,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:55,568] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:59,014] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:57,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:52:58,632] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:42,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:42,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:41,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:44,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:42,388] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:45,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:42,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:45,358] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:42,932] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:45,819] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:43,294] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:43,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:46,605] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:49,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:47,703] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:53:48,485] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:28,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:29,468] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:28,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:29,570] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:32,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:30,211] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:33,198] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:30,871] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:33,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:31,300] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:34,489] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:35,261] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:33,829] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:36,388] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:39,512] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 00:54:37,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 00:55:43,748 (deepspeed_trainer:228) INFO: 18epoch:train:7501-7600batch: iter_time=2.447, loss_ctc=77.150, loss_att=55.182, acc=0.708, loss=61.744, grad_norm=4.581, loss_scale=1.000, learning_rate=9.561e-05, step_time=0.435 [cnode7-012:0/16] 2024-12-07 00:56:20,315 (deepspeed_trainer:228) INFO: 18epoch:train:7601-7700batch: iter_time=1.020e-04, loss_ctc=66.070, loss_att=51.087, acc=0.709, loss=55.559, grad_norm=4.787, loss_scale=1.000, learning_rate=9.559e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 00:56:57,172 (deepspeed_trainer:228) INFO: 18epoch:train:7701-7800batch: iter_time=1.051e-04, loss_ctc=86.267, loss_att=60.218, acc=0.707, loss=68.043, grad_norm=6.036, loss_scale=1.000, learning_rate=9.557e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 00:57:34,041 (deepspeed_trainer:228) INFO: 18epoch:train:7801-7900batch: iter_time=1.059e-04, loss_ctc=81.252, loss_att=63.526, acc=0.687, loss=68.837, grad_norm=6.463, loss_scale=1.000, learning_rate=9.555e-05, step_time=0.368 [2024-12-07 00:58:11,182] [INFO] [logging.py:129:log_dist] [Rank 0] step=263000, skipped=0, lr=[np.float64(9.552717198618607e-05)], mom=[[0.9, 0.98]] [2024-12-07 00:58:11,183] [INFO] [timer.py:264:stop] epoch=0/micro_step=8000/global_step=8000, RunningAvgSamplesPerSec=44.2146255461957, CurrSamplesPerSec=44.371310016329254, MemAllocated=1.77GB, MaxMemAllocated=15.26GB [cnode7-012:0/16] 2024-12-07 00:58:11,184 (deepspeed_trainer:228) INFO: 18epoch:train:7901-8000batch: iter_time=1.044e-04, loss_ctc=74.855, loss_att=58.654, acc=0.706, loss=63.498, grad_norm=4.221, loss_scale=1.000, learning_rate=9.554e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 00:58:48,457 (deepspeed_trainer:228) INFO: 18epoch:train:8001-8100batch: iter_time=1.059e-04, loss_ctc=86.970, loss_att=65.332, acc=0.695, loss=71.835, grad_norm=5.280, loss_scale=1.000, learning_rate=9.552e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 00:59:25,670 (deepspeed_trainer:228) INFO: 18epoch:train:8101-8200batch: iter_time=1.071e-04, loss_ctc=80.245, loss_att=58.559, acc=0.703, loss=65.074, grad_norm=5.085, loss_scale=1.000, learning_rate=9.550e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 01:00:02,939 (deepspeed_trainer:228) INFO: 18epoch:train:8201-8300batch: iter_time=1.049e-04, loss_ctc=75.884, loss_att=63.112, acc=0.683, loss=66.919, grad_norm=4.345, loss_scale=1.000, learning_rate=9.548e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 01:00:39,864 (deepspeed_trainer:228) INFO: 18epoch:train:8301-8400batch: iter_time=1.059e-04, loss_ctc=66.610, loss_att=49.597, acc=0.696, loss=54.702, grad_norm=4.316, loss_scale=1.000, learning_rate=9.546e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 01:01:16,405 (deepspeed_trainer:228) INFO: 18epoch:train:8401-8500batch: iter_time=1.034e-04, loss_ctc=65.142, loss_att=48.671, acc=0.689, loss=53.630, grad_norm=4.998, loss_scale=1.000, learning_rate=9.545e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 01:01:53,199 (deepspeed_trainer:228) INFO: 18epoch:train:8501-8600batch: iter_time=1.053e-04, loss_ctc=81.399, loss_att=54.869, acc=0.697, loss=62.854, grad_norm=5.150, loss_scale=1.000, learning_rate=9.543e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 01:02:29,929 (deepspeed_trainer:228) INFO: 18epoch:train:8601-8700batch: iter_time=1.065e-04, loss_ctc=70.166, loss_att=59.280, acc=0.692, loss=62.532, grad_norm=4.593, loss_scale=1.000, learning_rate=9.541e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 01:03:06,947 (deepspeed_trainer:228) INFO: 18epoch:train:8701-8800batch: iter_time=1.047e-04, loss_ctc=80.349, loss_att=54.157, acc=0.718, loss=62.019, grad_norm=4.609, loss_scale=1.000, learning_rate=9.539e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 01:03:43,742 (deepspeed_trainer:228) INFO: 18epoch:train:8801-8900batch: iter_time=1.046e-04, loss_ctc=70.114, loss_att=56.316, acc=0.690, loss=60.450, grad_norm=4.507, loss_scale=1.000, learning_rate=9.537e-05, step_time=0.368 [2024-12-07 01:04:20,783] [INFO] [logging.py:129:log_dist] [Rank 0] step=264000, skipped=0, lr=[np.float64(9.534607834503641e-05)], mom=[[0.9, 0.98]] [2024-12-07 01:04:20,783] [INFO] [timer.py:264:stop] epoch=0/micro_step=9000/global_step=9000, RunningAvgSamplesPerSec=44.23153501144783, CurrSamplesPerSec=45.39670209029924, MemAllocated=1.77GB, MaxMemAllocated=15.26GB [cnode7-012:0/16] 2024-12-07 01:04:20,785 (deepspeed_trainer:228) INFO: 18epoch:train:8901-9000batch: iter_time=1.074e-04, loss_ctc=82.947, loss_att=60.938, acc=0.701, loss=67.571, grad_norm=4.672, loss_scale=1.000, learning_rate=9.536e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 01:04:57,719 (deepspeed_trainer:228) INFO: 18epoch:train:9001-9100batch: iter_time=1.059e-04, loss_ctc=79.510, loss_att=56.653, acc=0.707, loss=63.510, grad_norm=4.641, loss_scale=1.000, learning_rate=9.534e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 01:05:34,114 (deepspeed_trainer:228) INFO: 18epoch:train:9101-9200batch: iter_time=1.064e-04, loss_ctc=72.608, loss_att=53.857, acc=0.697, loss=59.506, grad_norm=4.704, loss_scale=1.000, learning_rate=9.532e-05, step_time=0.363 [cnode7-012:0/16] 2024-12-07 01:06:10,802 (deepspeed_trainer:228) INFO: 18epoch:train:9201-9300batch: iter_time=1.075e-04, loss_ctc=77.426, loss_att=60.335, acc=0.700, loss=65.468, grad_norm=4.502, loss_scale=1.000, learning_rate=9.530e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 01:06:42,047 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-07 01:07:08,009 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 01:07:24,178 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 01:07:24,178 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 01:07:24,181 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 01:07:49,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:07:49,594] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:07:48,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:07:51,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:07:48,927] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:07:49,048] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:07:51,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:07:52,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:07:49,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:07:49,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:07:52,658] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:07:52,702] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:07:50,081] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:07:50,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:07:53,312] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:07:50,629] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:37,064] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:36,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:36,951] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:39,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:39,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:37,485] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:40,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:38,475] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:41,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:39,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:42,037] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:39,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:39,413] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:39,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:42,808] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:08:43,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:24,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:24,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:24,977] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:28,147] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:25,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:28,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:26,691] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:29,551] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:27,629] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:30,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:30,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:28,340] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:28,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:28,412] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:31,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:09:38,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:10,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:12,415] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:12,711] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:15,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:13,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:16,345] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:17,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:15,048] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:17,979] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:15,927] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:16,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:16,159] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:16,591] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:19,777] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:20,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:10:33,086] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 01:11:08,389 (deepspeed_trainer:228) INFO: 18epoch:train:9301-9400batch: iter_time=2.469, loss_ctc=66.650, loss_att=45.974, acc=0.711, loss=52.184, grad_norm=4.125, loss_scale=1.000, learning_rate=9.528e-05, step_time=0.507 [cnode7-012:0/16] 2024-12-07 01:11:45,857 (deepspeed_trainer:228) INFO: 18epoch:train:9401-9500batch: iter_time=1.039e-04, loss_ctc=72.689, loss_att=51.730, acc=0.708, loss=58.014, grad_norm=4.289, loss_scale=1.000, learning_rate=9.526e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 01:12:23,279 (deepspeed_trainer:228) INFO: 18epoch:train:9501-9600batch: iter_time=1.028e-04, loss_ctc=68.415, loss_att=53.217, acc=0.710, loss=57.757, grad_norm=4.197, loss_scale=1.000, learning_rate=9.525e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 01:13:00,918 (deepspeed_trainer:228) INFO: 18epoch:train:9601-9700batch: iter_time=1.069e-04, loss_ctc=92.124, loss_att=60.481, acc=0.705, loss=69.969, grad_norm=7.069, loss_scale=1.000, learning_rate=9.523e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 01:13:38,586 (deepspeed_trainer:228) INFO: 18epoch:train:9701-9800batch: iter_time=1.064e-04, loss_ctc=72.440, loss_att=61.103, acc=0.693, loss=64.508, grad_norm=4.311, loss_scale=1.000, learning_rate=9.521e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 01:14:16,246 (deepspeed_trainer:228) INFO: 18epoch:train:9801-9900batch: iter_time=1.061e-04, loss_ctc=77.275, loss_att=58.355, acc=0.709, loss=64.024, grad_norm=4.506, loss_scale=1.000, learning_rate=9.519e-05, step_time=0.376 [2024-12-07 01:14:54,220] [INFO] [logging.py:129:log_dist] [Rank 0] step=265000, skipped=0, lr=[np.float64(9.516601072783589e-05)], mom=[[0.9, 0.98]] [2024-12-07 01:14:54,221] [INFO] [timer.py:264:stop] epoch=0/micro_step=10000/global_step=10000, RunningAvgSamplesPerSec=44.042055136579314, CurrSamplesPerSec=44.19624191351286, MemAllocated=1.77GB, MaxMemAllocated=15.26GB [cnode7-012:0/16] 2024-12-07 01:14:54,222 (deepspeed_trainer:228) INFO: 18epoch:train:9901-10000batch: iter_time=1.058e-04, loss_ctc=91.955, loss_att=67.167, acc=0.691, loss=74.632, grad_norm=5.855, loss_scale=1.000, learning_rate=9.517e-05, step_time=0.380 [cnode7-012:0/16] 2024-12-07 01:15:32,119 (deepspeed_trainer:228) INFO: 18epoch:train:10001-10100batch: iter_time=1.063e-04, loss_ctc=76.350, loss_att=57.529, acc=0.701, loss=63.181, grad_norm=4.372, loss_scale=1.000, learning_rate=9.516e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 01:16:09,235 (deepspeed_trainer:228) INFO: 18epoch:train:10101-10200batch: iter_time=1.044e-04, loss_ctc=69.620, loss_att=58.370, acc=0.687, loss=61.712, grad_norm=4.849, loss_scale=1.000, learning_rate=9.514e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 01:16:46,124 (deepspeed_trainer:228) INFO: 18epoch:train:10201-10300batch: iter_time=1.046e-04, loss_ctc=69.694, loss_att=52.279, acc=0.691, loss=57.507, grad_norm=4.828, loss_scale=1.000, learning_rate=9.512e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 01:17:23,838 (deepspeed_trainer:228) INFO: 18epoch:train:10301-10400batch: iter_time=1.085e-04, loss_ctc=64.939, loss_att=45.266, acc=0.694, loss=51.182, grad_norm=4.773, loss_scale=1.000, learning_rate=9.510e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 01:18:00,915 (deepspeed_trainer:228) INFO: 18epoch:train:10401-10500batch: iter_time=1.053e-04, loss_ctc=76.958, loss_att=53.675, acc=0.699, loss=60.667, grad_norm=4.795, loss_scale=1.000, learning_rate=9.509e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 01:18:38,362 (deepspeed_trainer:228) INFO: 18epoch:train:10501-10600batch: iter_time=1.113e-04, loss_ctc=74.609, loss_att=63.895, acc=0.694, loss=67.085, grad_norm=4.263, loss_scale=1.000, learning_rate=9.507e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 01:19:15,524 (deepspeed_trainer:228) INFO: 18epoch:train:10601-10700batch: iter_time=1.082e-04, loss_ctc=84.565, loss_att=55.951, acc=0.705, loss=64.534, grad_norm=4.504, loss_scale=1.000, learning_rate=9.505e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 01:19:52,275 (deepspeed_trainer:228) INFO: 18epoch:train:10701-10800batch: iter_time=1.079e-04, loss_ctc=65.719, loss_att=52.018, acc=0.708, loss=56.161, grad_norm=4.205, loss_scale=1.000, learning_rate=9.503e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 01:20:30,213 (deepspeed_trainer:228) INFO: 18epoch:train:10801-10900batch: iter_time=1.120e-04, loss_ctc=81.194, loss_att=63.419, acc=0.695, loss=68.707, grad_norm=4.392, loss_scale=1.000, learning_rate=9.501e-05, step_time=0.379 [2024-12-07 01:21:07,662] [INFO] [logging.py:129:log_dist] [Rank 0] step=266000, skipped=0, lr=[np.float64(9.498695948244701e-05)], mom=[[0.9, 0.98]] [2024-12-07 01:21:07,663] [INFO] [timer.py:264:stop] epoch=0/micro_step=11000/global_step=11000, RunningAvgSamplesPerSec=44.03151909053076, CurrSamplesPerSec=43.87260675808988, MemAllocated=1.77GB, MaxMemAllocated=15.26GB [cnode7-012:0/16] 2024-12-07 01:21:07,664 (deepspeed_trainer:228) INFO: 18epoch:train:10901-11000batch: iter_time=1.120e-04, loss_ctc=79.659, loss_att=55.663, acc=0.708, loss=62.858, grad_norm=4.342, loss_scale=1.000, learning_rate=9.500e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 01:21:44,694 (deepspeed_trainer:228) INFO: 18epoch:train:11001-11100batch: iter_time=1.095e-04, loss_ctc=73.549, loss_att=56.688, acc=0.701, loss=61.722, grad_norm=4.642, loss_scale=1.000, learning_rate=9.498e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 01:22:21,676 (deepspeed_trainer:228) INFO: 18epoch:train:11101-11200batch: iter_time=1.067e-04, loss_ctc=74.731, loss_att=55.502, acc=0.701, loss=61.275, grad_norm=4.581, loss_scale=1.000, learning_rate=9.496e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 01:22:44,560 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-07 01:23:10,751 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 01:23:26,776 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 01:23:26,776 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 01:23:26,778 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 01:23:51,608] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:23:51,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:23:50,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:23:54,161] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:23:51,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:23:54,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:23:54,686] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:23:55,047] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:23:55,453] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:23:52,721] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:23:52,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:23:53,078] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:23:53,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:23:53,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:23:53,275] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:23:57,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:39,231] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:39,804] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:42,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:42,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:40,036] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:43,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:40,464] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:43,434] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:40,999] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:41,015] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:41,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:44,064] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:41,634] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:42,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:44,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:24:47,590] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:25,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:26,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:29,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:28,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:31,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:28,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:28,807] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:31,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:31,719] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:29,136] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:29,482] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:32,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:30,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:30,214] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:37,494] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:25:34,747] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:12,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:13,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:16,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:15,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:16,021] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:19,086] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:16,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:19,496] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:20,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:17,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:20,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:17,763] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:18,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:20,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:26,539] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:26:24,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 01:27:11,654 (deepspeed_trainer:228) INFO: 18epoch:train:11201-11300batch: iter_time=2.463, loss_ctc=67.240, loss_att=45.953, acc=0.716, loss=52.346, grad_norm=4.208, loss_scale=1.000, learning_rate=9.494e-05, step_time=0.436 [cnode7-012:0/16] 2024-12-07 01:27:49,369 (deepspeed_trainer:228) INFO: 18epoch:train:11301-11400batch: iter_time=9.885e-05, loss_ctc=73.220, loss_att=52.735, acc=0.710, loss=58.874, grad_norm=4.050, loss_scale=1.000, learning_rate=9.492e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 01:28:27,604 (deepspeed_trainer:228) INFO: 18epoch:train:11401-11500batch: iter_time=1.016e-04, loss_ctc=65.734, loss_att=49.867, acc=0.713, loss=54.624, grad_norm=4.283, loss_scale=1.000, learning_rate=9.491e-05, step_time=0.382 [cnode7-012:0/16] 2024-12-07 01:29:04,972 (deepspeed_trainer:228) INFO: 18epoch:train:11501-11600batch: iter_time=1.047e-04, loss_ctc=95.403, loss_att=65.654, acc=0.695, loss=74.606, grad_norm=7.252, loss_scale=1.000, learning_rate=9.489e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 01:29:42,559 (deepspeed_trainer:228) INFO: 18epoch:train:11601-11700batch: iter_time=1.056e-04, loss_ctc=75.989, loss_att=60.519, acc=0.696, loss=65.132, grad_norm=4.838, loss_scale=1.000, learning_rate=9.487e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 01:30:19,954 (deepspeed_trainer:228) INFO: 18epoch:train:11701-11800batch: iter_time=1.049e-04, loss_ctc=88.268, loss_att=67.137, acc=0.706, loss=73.465, grad_norm=6.068, loss_scale=1.000, learning_rate=9.485e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 01:30:56,904 (deepspeed_trainer:228) INFO: 18epoch:train:11801-11900batch: iter_time=1.053e-04, loss_ctc=74.987, loss_att=58.054, acc=0.699, loss=63.141, grad_norm=4.639, loss_scale=1.000, learning_rate=9.484e-05, step_time=0.369 [2024-12-07 01:31:33,922] [INFO] [logging.py:129:log_dist] [Rank 0] step=267000, skipped=0, lr=[np.float64(9.480891508337689e-05)], mom=[[0.9, 0.98]] [2024-12-07 01:31:33,923] [INFO] [timer.py:264:stop] epoch=0/micro_step=12000/global_step=12000, RunningAvgSamplesPerSec=43.95726353360236, CurrSamplesPerSec=46.98091529388967, MemAllocated=1.77GB, MaxMemAllocated=15.26GB [cnode7-012:0/16] 2024-12-07 01:31:33,924 (deepspeed_trainer:228) INFO: 18epoch:train:11901-12000batch: iter_time=1.047e-04, loss_ctc=74.435, loss_att=57.395, acc=0.694, loss=62.552, grad_norm=4.640, loss_scale=1.000, learning_rate=9.482e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 01:32:10,696 (deepspeed_trainer:228) INFO: 18epoch:train:12001-12100batch: iter_time=1.060e-04, loss_ctc=69.934, loss_att=58.374, acc=0.693, loss=61.839, grad_norm=4.348, loss_scale=1.000, learning_rate=9.480e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 01:32:47,403 (deepspeed_trainer:228) INFO: 18epoch:train:12101-12200batch: iter_time=1.057e-04, loss_ctc=68.426, loss_att=49.997, acc=0.692, loss=55.534, grad_norm=4.639, loss_scale=1.000, learning_rate=9.478e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 01:33:24,618 (deepspeed_trainer:228) INFO: 18epoch:train:12201-12300batch: iter_time=1.123e-04, loss_ctc=66.390, loss_att=46.749, acc=0.694, loss=52.653, grad_norm=4.229, loss_scale=1.000, learning_rate=9.476e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 01:34:02,044 (deepspeed_trainer:228) INFO: 18epoch:train:12301-12400batch: iter_time=1.110e-04, loss_ctc=80.978, loss_att=57.635, acc=0.700, loss=64.634, grad_norm=4.587, loss_scale=1.000, learning_rate=9.475e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 01:34:39,483 (deepspeed_trainer:228) INFO: 18epoch:train:12401-12500batch: iter_time=1.079e-04, loss_ctc=75.933, loss_att=60.394, acc=0.703, loss=65.073, grad_norm=4.521, loss_scale=1.000, learning_rate=9.473e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 01:35:16,721 (deepspeed_trainer:228) INFO: 18epoch:train:12501-12600batch: iter_time=1.047e-04, loss_ctc=77.705, loss_att=52.391, acc=0.711, loss=60.008, grad_norm=4.226, loss_scale=1.000, learning_rate=9.471e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 01:35:53,451 (deepspeed_trainer:228) INFO: 18epoch:train:12601-12700batch: iter_time=1.059e-04, loss_ctc=71.871, loss_att=57.242, acc=0.705, loss=61.652, grad_norm=3.893, loss_scale=1.000, learning_rate=9.469e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 01:36:30,466 (deepspeed_trainer:228) INFO: 18epoch:train:12701-12800batch: iter_time=1.055e-04, loss_ctc=84.331, loss_att=64.346, acc=0.685, loss=70.343, grad_norm=5.187, loss_scale=1.000, learning_rate=9.468e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 01:37:07,161 (deepspeed_trainer:228) INFO: 18epoch:train:12801-12900batch: iter_time=1.057e-04, loss_ctc=71.004, loss_att=50.858, acc=0.713, loss=56.909, grad_norm=3.998, loss_scale=1.000, learning_rate=9.466e-05, step_time=0.367 [2024-12-07 01:37:43,976] [INFO] [logging.py:129:log_dist] [Rank 0] step=268000, skipped=0, lr=[np.float64(9.463186812964868e-05)], mom=[[0.9, 0.98]] [2024-12-07 01:37:43,977] [INFO] [timer.py:264:stop] epoch=0/micro_step=13000/global_step=13000, RunningAvgSamplesPerSec=43.98944557216891, CurrSamplesPerSec=44.001796197117116, MemAllocated=1.77GB, MaxMemAllocated=15.26GB [cnode7-012:0/16] 2024-12-07 01:37:43,978 (deepspeed_trainer:228) INFO: 18epoch:train:12901-13000batch: iter_time=1.049e-04, loss_ctc=74.278, loss_att=56.629, acc=0.698, loss=61.946, grad_norm=4.320, loss_scale=1.000, learning_rate=9.464e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 01:38:20,569 (deepspeed_trainer:228) INFO: 18epoch:train:13001-13100batch: iter_time=1.059e-04, loss_ctc=73.623, loss_att=53.682, acc=0.704, loss=59.672, grad_norm=4.983, loss_scale=1.000, learning_rate=9.462e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 01:38:33,225 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-07 01:38:59,594 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 01:39:15,065 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 01:39:15,065 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 01:39:15,068 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 01:39:42,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:39:42,857] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:39:43,279] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:39:43,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:39:43,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:39:41,252] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:39:41,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:39:44,541] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:39:44,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:39:41,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:39:44,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:39:42,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:39:42,868] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:39:42,900] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:39:42,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:39:43,020] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:30,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:30,553] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:30,861] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:31,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:32,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:29,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:32,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:29,935] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:32,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:33,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:30,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:31,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:31,424] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:31,928] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:32,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:40:32,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:18,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:18,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:18,587] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:20,082] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:17,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:20,962] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:18,164] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:21,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:21,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:21,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:19,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:19,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:19,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:19,935] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:20,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:41:20,279] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:05,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:05,087] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:08,299] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:05,986] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:08,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:06,074] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:09,193] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:09,487] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:09,604] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:06,738] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:07,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:07,480] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:08,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:08,212] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:08,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:42:12,014] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 01:43:05,967 (deepspeed_trainer:228) INFO: 18epoch:train:13101-13200batch: iter_time=2.465, loss_ctc=67.624, loss_att=46.556, acc=0.709, loss=52.871, grad_norm=4.489, loss_scale=1.000, learning_rate=9.461e-05, step_time=0.389 [cnode7-012:0/16] 2024-12-07 01:43:43,026 (deepspeed_trainer:228) INFO: 18epoch:train:13201-13300batch: iter_time=1.043e-04, loss_ctc=74.739, loss_att=58.891, acc=0.702, loss=63.623, grad_norm=4.358, loss_scale=1.000, learning_rate=9.459e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 01:44:19,880 (deepspeed_trainer:228) INFO: 18epoch:train:13301-13400batch: iter_time=1.054e-04, loss_ctc=82.808, loss_att=56.202, acc=0.709, loss=64.176, grad_norm=5.553, loss_scale=1.000, learning_rate=9.457e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 01:44:56,866 (deepspeed_trainer:228) INFO: 18epoch:train:13401-13500batch: iter_time=1.091e-04, loss_ctc=80.963, loss_att=60.108, acc=0.690, loss=66.393, grad_norm=5.909, loss_scale=1.000, learning_rate=9.455e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 01:45:33,987 (deepspeed_trainer:228) INFO: 18epoch:train:13501-13600batch: iter_time=1.100e-04, loss_ctc=74.741, loss_att=59.014, acc=0.698, loss=63.749, grad_norm=4.257, loss_scale=1.000, learning_rate=9.453e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 01:46:11,443 (deepspeed_trainer:228) INFO: 18epoch:train:13601-13700batch: iter_time=1.090e-04, loss_ctc=89.819, loss_att=63.192, acc=0.699, loss=71.199, grad_norm=5.829, loss_scale=1.000, learning_rate=9.452e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 01:46:48,664 (deepspeed_trainer:228) INFO: 18epoch:train:13701-13800batch: iter_time=1.108e-04, loss_ctc=73.674, loss_att=57.910, acc=0.699, loss=62.632, grad_norm=4.378, loss_scale=1.000, learning_rate=9.450e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 01:47:25,672 (deepspeed_trainer:228) INFO: 18epoch:train:13801-13900batch: iter_time=1.089e-04, loss_ctc=74.059, loss_att=58.254, acc=0.688, loss=63.011, grad_norm=4.319, loss_scale=1.000, learning_rate=9.448e-05, step_time=0.370 [2024-12-07 01:48:02,652] [INFO] [logging.py:129:log_dist] [Rank 0] step=269000, skipped=0, lr=[np.float64(9.445580934271679e-05)], mom=[[0.9, 0.98]] [2024-12-07 01:48:02,652] [INFO] [timer.py:264:stop] epoch=0/micro_step=14000/global_step=14000, RunningAvgSamplesPerSec=43.99185989081431, CurrSamplesPerSec=41.74838309390105, MemAllocated=1.77GB, MaxMemAllocated=15.26GB [cnode7-012:0/16] 2024-12-07 01:48:02,654 (deepspeed_trainer:228) INFO: 18epoch:train:13901-14000batch: iter_time=1.111e-04, loss_ctc=69.404, loss_att=55.366, acc=0.698, loss=59.578, grad_norm=4.566, loss_scale=1.000, learning_rate=9.446e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 01:48:38,965 (deepspeed_trainer:228) INFO: 18epoch:train:14001-14100batch: iter_time=1.081e-04, loss_ctc=67.772, loss_att=49.500, acc=0.679, loss=54.978, grad_norm=4.841, loss_scale=1.000, learning_rate=9.445e-05, step_time=0.363 [cnode7-012:0/16] 2024-12-07 01:49:15,169 (deepspeed_trainer:228) INFO: 18epoch:train:14101-14200batch: iter_time=1.074e-04, loss_ctc=71.013, loss_att=46.622, acc=0.701, loss=53.953, grad_norm=4.466, loss_scale=1.000, learning_rate=9.443e-05, step_time=0.362 [cnode7-012:0/16] 2024-12-07 01:49:52,486 (deepspeed_trainer:228) INFO: 18epoch:train:14201-14300batch: iter_time=1.070e-04, loss_ctc=76.431, loss_att=63.579, acc=0.687, loss=67.404, grad_norm=5.066, loss_scale=1.000, learning_rate=9.441e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 01:50:29,199 (deepspeed_trainer:228) INFO: 18epoch:train:14301-14400batch: iter_time=1.084e-04, loss_ctc=80.873, loss_att=55.371, acc=0.702, loss=63.034, grad_norm=4.670, loss_scale=1.000, learning_rate=9.439e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 01:51:05,751 (deepspeed_trainer:228) INFO: 18epoch:train:14401-14500batch: iter_time=1.076e-04, loss_ctc=72.709, loss_att=54.030, acc=0.700, loss=59.657, grad_norm=4.220, loss_scale=1.000, learning_rate=9.438e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 01:51:42,303 (deepspeed_trainer:228) INFO: 18epoch:train:14501-14600batch: iter_time=1.084e-04, loss_ctc=75.319, loss_att=59.651, acc=0.697, loss=64.349, grad_norm=4.280, loss_scale=1.000, learning_rate=9.436e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 01:52:20,087 (deepspeed_trainer:228) INFO: 18epoch:train:14601-14700batch: iter_time=1.087e-04, loss_ctc=83.259, loss_att=59.361, acc=0.700, loss=66.536, grad_norm=4.648, loss_scale=1.000, learning_rate=9.434e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 01:52:56,495 (deepspeed_trainer:228) INFO: 18epoch:train:14701-14800batch: iter_time=1.074e-04, loss_ctc=67.404, loss_att=50.010, acc=0.702, loss=55.235, grad_norm=4.155, loss_scale=1.000, learning_rate=9.432e-05, step_time=0.364 [cnode7-012:0/16] 2024-12-07 01:53:33,300 (deepspeed_trainer:228) INFO: 18epoch:train:14801-14900batch: iter_time=1.083e-04, loss_ctc=76.452, loss_att=56.691, acc=0.693, loss=62.614, grad_norm=4.693, loss_scale=1.000, learning_rate=9.431e-05, step_time=0.368 [2024-12-07 01:54:10,143] [INFO] [logging.py:129:log_dist] [Rank 0] step=270000, skipped=0, lr=[np.float64(9.428072956442436e-05)], mom=[[0.9, 0.98]] [2024-12-07 01:54:10,144] [INFO] [timer.py:264:stop] epoch=0/micro_step=15000/global_step=15000, RunningAvgSamplesPerSec=44.0309384459358, CurrSamplesPerSec=47.856855357457505, MemAllocated=1.77GB, MaxMemAllocated=15.26GB [cnode7-012:0/16] 2024-12-07 01:54:10,145 (deepspeed_trainer:228) INFO: 18epoch:train:14901-15000batch: iter_time=1.038e-04, loss_ctc=70.166, loss_att=51.006, acc=0.700, loss=56.737, grad_norm=4.329, loss_scale=1.000, learning_rate=9.429e-05, step_time=0.368 [2024-12-07 01:54:21,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:21,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:24,844] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:24,898] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:25,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:22,460] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:25,515] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:22,517] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:22,569] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:25,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:22,696] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:23,361] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:26,395] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:23,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:26,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:26,474] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:37,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:37,943] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:38,889] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:41,916] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:41,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:38,950] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:42,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:42,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:42,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:39,785] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:39,823] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:43,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:40,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:40,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:43,381] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:43,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:52,567] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:54,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:57,327] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:54,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:57,569] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:54,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:55,222] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:58,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:58,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:55,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:58,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:56,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:59,049] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:59,234] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:59,466] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:54:56,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:08,418] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:12,260] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:09,283] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:09,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:13,037] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:10,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:13,678] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:11,141] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:14,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:14,462] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:11,512] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:11,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:14,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:15,118] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:15,427] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:12,652] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:55:28,095] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 18 is about to be saved! [2024-12-07 01:55:28,130] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/mp_rank_00_model_states.pt [2024-12-07 01:55:28,130] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/mp_rank_00_model_states.pt... [2024-12-07 01:55:30,102] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/mp_rank_00_model_states.pt. [2024-12-07 01:55:30,258] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-07 01:55:27,253] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-07 01:55:27,255] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-07 01:55:27,255] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-07 01:55:27,256] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-07 01:55:30,262] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-07 01:55:27,257] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-07 01:55:30,265] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-07 01:55:30,265] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-07 01:55:30,265] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-07 01:55:30,265] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-07 01:55:27,259] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-07 01:55:30,265] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-07 01:55:27,259] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-07 01:55:30,266] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-07 01:55:27,260] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-07 01:55:30,923] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-07 01:55:30,930] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-07 01:55:30,930] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [2024-12-07 01:55:31,660] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-07 01:55:31,660] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-07 01:55:31,660] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [2024-12-07 01:55:31,664] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-07 01:55:31,665] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-07 01:55:31,665] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [2024-12-07 01:55:32,261] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-07 01:55:32,261] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-07 01:55:32,261] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [2024-12-07 01:55:32,281] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-07 01:55:32,281] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-07 01:55:32,281] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [2024-12-07 01:55:29,276] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-07 01:55:29,276] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-07 01:55:29,276] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-07 01:55:29,276] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-07 01:55:29,276] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [2024-12-07 01:55:29,276] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [2024-12-07 01:55:29,276] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-07 01:55:29,276] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-07 01:55:29,276] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [2024-12-07 01:55:32,555] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-07 01:55:32,556] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-07 01:55:32,556] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [2024-12-07 01:55:32,680] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-07 01:55:32,680] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-07 01:55:32,680] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [2024-12-07 01:55:32,914] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-07 01:55:32,914] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-07 01:55:32,914] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [2024-12-07 01:55:30,017] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-07 01:55:30,017] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-07 01:55:30,017] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [2024-12-07 01:55:30,245] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-07 01:55:30,245] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-07 01:55:30,245] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [2024-12-07 01:55:30,330] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-07 01:55:30,330] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-07 01:55:30,330] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [2024-12-07 01:55:31,131] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-07 01:55:31,131] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-07 01:55:31,131] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [2024-12-07 01:55:31,357] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-07 01:55:31,357] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_18/18/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-07 01:55:31,357] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 18 is ready now! [cnode7-012:0/16] 2024-12-07 01:55:34,464 (deepspeed_trainer:158) INFO: 18epoch results: [train] iter_time=0.131, loss_ctc=76.355, loss_att=56.552, acc=0.697, loss=62.493, grad_norm=4.763, loss_scale=1.000, learning_rate=9.563e-05, step_time=0.374, time=2 hours, 6 minutes and 23.1 seconds, total_count=270018, gpu_max_cached_mem_GB=27.982, [valid] loss_ctc=4.562, cer_ctc=0.097, loss_att=6.625, acc=0.848, cer=0.261, wer=0.875, loss=6.000, time=1 minute and 9.37 seconds, total_count=18, gpu_max_cached_mem_GB=27.982 [cnode7-012:0/16] 2024-12-07 01:55:36,715 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-07 01:56:03,023 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 01:56:19,041 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 01:56:19,041 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 01:56:19,043 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 01:56:35,540] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:56:35,840] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:56:39,043] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:56:39,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:56:36,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:56:37,018] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:56:37,645] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:56:37,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:56:41,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:56:38,458] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:56:41,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:56:41,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:56:42,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:56:42,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:56:39,823] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:56:42,904] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:25,940] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:23,526] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:24,075] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:24,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:27,657] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:24,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:25,081] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:25,141] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:28,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:28,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:29,534] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:29,571] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:26,724] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:30,736] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:27,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:57:32,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:11,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:11,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:11,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:12,211] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:16,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:13,150] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:13,386] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:16,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:13,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:17,200] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:17,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:17,838] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:15,031] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:15,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:20,640] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:21,716] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:57,512] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:58,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:59,816] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:58:59,924] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:59:02,211] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:59:02,486] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:59:01,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:59:02,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:59:02,143] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:59:04,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:59:04,388] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:59:04,731] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:59:02,895] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:59:04,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:59:09,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 01:59:10,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): [cnode7-012:0/16] 2024-12-07 02:00:13,797 (deepspeed_trainer:228) INFO: 19epoch:train:1-100batch: iter_time=2.353, loss_ctc=78.124, loss_att=59.694, acc=0.704, loss=65.183, grad_norm=4.422, loss_scale=1.000, learning_rate=9.427e-05, step_time=0.427 [cnode7-012:0/16] 2024-12-07 02:00:50,959 (deepspeed_trainer:228) INFO: 19epoch:train:101-200batch: iter_time=1.062e-04, loss_ctc=70.794, loss_att=51.836, acc=0.700, loss=57.528, grad_norm=4.562, loss_scale=1.000, learning_rate=9.425e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 02:01:28,434 (deepspeed_trainer:228) INFO: 19epoch:train:201-300batch: iter_time=1.084e-04, loss_ctc=78.590, loss_att=60.892, acc=0.698, loss=66.215, grad_norm=4.817, loss_scale=1.000, learning_rate=9.424e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 02:02:05,937 (deepspeed_trainer:228) INFO: 19epoch:train:301-400batch: iter_time=1.128e-04, loss_ctc=75.837, loss_att=59.648, acc=0.690, loss=64.483, grad_norm=4.628, loss_scale=1.000, learning_rate=9.422e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 02:02:43,599 (deepspeed_trainer:228) INFO: 19epoch:train:401-500batch: iter_time=1.104e-04, loss_ctc=82.217, loss_att=57.348, acc=0.700, loss=64.801, grad_norm=5.338, loss_scale=1.000, learning_rate=9.420e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 02:03:21,059 (deepspeed_trainer:228) INFO: 19epoch:train:501-600batch: iter_time=1.086e-04, loss_ctc=83.670, loss_att=62.866, acc=0.688, loss=69.076, grad_norm=5.084, loss_scale=1.000, learning_rate=9.418e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 02:03:58,485 (deepspeed_trainer:228) INFO: 19epoch:train:601-700batch: iter_time=1.085e-04, loss_ctc=75.903, loss_att=64.704, acc=0.695, loss=68.053, grad_norm=5.148, loss_scale=1.000, learning_rate=9.417e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 02:04:36,021 (deepspeed_trainer:228) INFO: 19epoch:train:701-800batch: iter_time=1.077e-04, loss_ctc=80.720, loss_att=61.629, acc=0.704, loss=67.365, grad_norm=4.308, loss_scale=1.000, learning_rate=9.415e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 02:05:13,075 (deepspeed_trainer:228) INFO: 19epoch:train:801-900batch: iter_time=1.073e-04, loss_ctc=76.850, loss_att=59.782, acc=0.695, loss=64.907, grad_norm=4.438, loss_scale=1.000, learning_rate=9.413e-05, step_time=0.370 [2024-12-07 02:05:49,878] [INFO] [logging.py:129:log_dist] [Rank 0] step=271000, skipped=0, lr=[np.float64(9.410661975500244e-05)], mom=[[0.9, 0.98]] [2024-12-07 02:05:49,879] [INFO] [timer.py:264:stop] epoch=0/micro_step=16000/global_step=16000, RunningAvgSamplesPerSec=43.98329286956276, CurrSamplesPerSec=41.14891242913432, MemAllocated=2.04GB, MaxMemAllocated=15.53GB [cnode7-012:0/16] 2024-12-07 02:05:49,880 (deepspeed_trainer:228) INFO: 19epoch:train:901-1000batch: iter_time=1.105e-04, loss_ctc=62.933, loss_att=47.352, acc=0.714, loss=52.016, grad_norm=4.032, loss_scale=1.000, learning_rate=9.412e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 02:06:26,748 (deepspeed_trainer:228) INFO: 19epoch:train:1001-1100batch: iter_time=1.086e-04, loss_ctc=84.468, loss_att=59.740, acc=0.696, loss=67.183, grad_norm=4.973, loss_scale=1.000, learning_rate=9.410e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 02:07:03,902 (deepspeed_trainer:228) INFO: 19epoch:train:1101-1200batch: iter_time=1.096e-04, loss_ctc=72.406, loss_att=55.779, acc=0.682, loss=60.773, grad_norm=5.414, loss_scale=1.000, learning_rate=9.408e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 02:07:40,695 (deepspeed_trainer:228) INFO: 19epoch:train:1201-1300batch: iter_time=1.094e-04, loss_ctc=74.707, loss_att=53.060, acc=0.704, loss=59.532, grad_norm=4.737, loss_scale=1.000, learning_rate=9.406e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 02:08:17,536 (deepspeed_trainer:228) INFO: 19epoch:train:1301-1400batch: iter_time=1.081e-04, loss_ctc=66.239, loss_att=46.520, acc=0.717, loss=52.450, grad_norm=4.157, loss_scale=1.000, learning_rate=9.405e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 02:08:54,602 (deepspeed_trainer:228) INFO: 19epoch:train:1401-1500batch: iter_time=1.083e-04, loss_ctc=71.838, loss_att=53.692, acc=0.686, loss=59.135, grad_norm=4.541, loss_scale=1.000, learning_rate=9.403e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 02:09:32,491 (deepspeed_trainer:228) INFO: 19epoch:train:1501-1600batch: iter_time=1.092e-04, loss_ctc=88.686, loss_att=76.189, acc=0.674, loss=79.901, grad_norm=5.362, loss_scale=1.000, learning_rate=9.401e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 02:10:10,094 (deepspeed_trainer:228) INFO: 19epoch:train:1601-1700batch: iter_time=1.104e-04, loss_ctc=75.662, loss_att=60.876, acc=0.703, loss=65.312, grad_norm=4.911, loss_scale=1.000, learning_rate=9.399e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 02:10:47,804 (deepspeed_trainer:228) INFO: 19epoch:train:1701-1800batch: iter_time=1.118e-04, loss_ctc=81.474, loss_att=68.409, acc=0.692, loss=72.298, grad_norm=4.391, loss_scale=1.000, learning_rate=9.398e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 02:11:21,717 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-07 02:11:47,390 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 02:12:04,983 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 02:12:04,983 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 02:12:04,985 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 02:12:36,791] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:12:36,838] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:12:37,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:12:37,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:12:37,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:12:37,299] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:12:37,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:12:37,378] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:12:35,498] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:12:35,623] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:12:36,057] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:12:36,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:12:36,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:12:36,237] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:12:36,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:12:36,286] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:30,085] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:30,435] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:28,777] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:31,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:31,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:29,213] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:29,273] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:32,047] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:32,166] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:32,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:32,321] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:30,275] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:30,348] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:30,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:30,723] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:13:30,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:16,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:18,570] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:16,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:18,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:19,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:19,559] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:18,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:18,262] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:20,587] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:20,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:19,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:19,220] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:21,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:22,080] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:22,397] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:14:23,360] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:05,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:03,874] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:06,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:05,089] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:07,714] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:05,947] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:08,036] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:06,235] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:09,129] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:10,485] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:08,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:10,851] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:08,796] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:12,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:12,562] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:15:13,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 02:15:51,328 (deepspeed_trainer:228) INFO: 19epoch:train:1801-1900batch: iter_time=2.613, loss_ctc=87.226, loss_att=73.353, acc=0.680, loss=77.532, grad_norm=5.147, loss_scale=1.000, learning_rate=9.396e-05, step_time=0.422 [2024-12-07 02:16:29,340] [INFO] [logging.py:129:log_dist] [Rank 0] step=272000, skipped=0, lr=[np.float64(9.393347099110946e-05)], mom=[[0.9, 0.98]] [2024-12-07 02:16:29,342] [INFO] [timer.py:264:stop] epoch=0/micro_step=17000/global_step=17000, RunningAvgSamplesPerSec=43.946616510088155, CurrSamplesPerSec=45.000026993982296, MemAllocated=2.04GB, MaxMemAllocated=15.53GB [cnode7-012:0/16] 2024-12-07 02:16:29,344 (deepspeed_trainer:228) INFO: 19epoch:train:1901-2000batch: iter_time=1.053e-04, loss_ctc=69.078, loss_att=55.190, acc=0.707, loss=59.360, grad_norm=4.086, loss_scale=1.000, learning_rate=9.394e-05, step_time=0.380 [cnode7-012:0/16] 2024-12-07 02:17:06,964 (deepspeed_trainer:228) INFO: 19epoch:train:2001-2100batch: iter_time=1.033e-04, loss_ctc=72.639, loss_att=50.281, acc=0.712, loss=56.967, grad_norm=4.777, loss_scale=1.000, learning_rate=9.392e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 02:17:44,737 (deepspeed_trainer:228) INFO: 19epoch:train:2101-2200batch: iter_time=1.090e-04, loss_ctc=80.032, loss_att=61.471, acc=0.697, loss=66.998, grad_norm=5.410, loss_scale=1.000, learning_rate=9.391e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 02:18:22,455 (deepspeed_trainer:228) INFO: 19epoch:train:2201-2300batch: iter_time=1.094e-04, loss_ctc=73.815, loss_att=55.574, acc=0.698, loss=61.053, grad_norm=4.698, loss_scale=1.000, learning_rate=9.389e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 02:18:59,827 (deepspeed_trainer:228) INFO: 19epoch:train:2301-2400batch: iter_time=1.091e-04, loss_ctc=76.231, loss_att=54.689, acc=0.700, loss=61.166, grad_norm=4.911, loss_scale=1.000, learning_rate=9.387e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 02:19:38,277 (deepspeed_trainer:228) INFO: 19epoch:train:2401-2500batch: iter_time=1.102e-04, loss_ctc=91.028, loss_att=70.447, acc=0.681, loss=76.657, grad_norm=5.450, loss_scale=1.000, learning_rate=9.386e-05, step_time=0.384 [cnode7-012:0/16] 2024-12-07 02:20:15,986 (deepspeed_trainer:228) INFO: 19epoch:train:2501-2600batch: iter_time=1.094e-04, loss_ctc=69.806, loss_att=58.308, acc=0.710, loss=61.773, grad_norm=4.255, loss_scale=1.000, learning_rate=9.384e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 02:20:53,603 (deepspeed_trainer:228) INFO: 19epoch:train:2601-2700batch: iter_time=1.093e-04, loss_ctc=83.042, loss_att=62.222, acc=0.700, loss=68.484, grad_norm=4.799, loss_scale=1.000, learning_rate=9.382e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 02:21:31,088 (deepspeed_trainer:228) INFO: 19epoch:train:2701-2800batch: iter_time=1.106e-04, loss_ctc=73.399, loss_att=56.971, acc=0.702, loss=61.875, grad_norm=4.659, loss_scale=1.000, learning_rate=9.380e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 02:22:08,292 (deepspeed_trainer:228) INFO: 19epoch:train:2801-2900batch: iter_time=1.098e-04, loss_ctc=64.229, loss_att=46.261, acc=0.721, loss=51.660, grad_norm=3.766, loss_scale=1.000, learning_rate=9.379e-05, step_time=0.372 [2024-12-07 02:22:45,795] [INFO] [logging.py:129:log_dist] [Rank 0] step=273000, skipped=0, lr=[np.float64(9.376127446391043e-05)], mom=[[0.9, 0.98]] [2024-12-07 02:22:45,795] [INFO] [timer.py:264:stop] epoch=0/micro_step=18000/global_step=18000, RunningAvgSamplesPerSec=43.925087796396184, CurrSamplesPerSec=44.246216097948086, MemAllocated=2.04GB, MaxMemAllocated=15.53GB [cnode7-012:0/16] 2024-12-07 02:22:45,797 (deepspeed_trainer:228) INFO: 19epoch:train:2901-3000batch: iter_time=1.104e-04, loss_ctc=83.947, loss_att=61.421, acc=0.676, loss=68.186, grad_norm=5.466, loss_scale=1.000, learning_rate=9.377e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 02:23:22,755 (deepspeed_trainer:228) INFO: 19epoch:train:3001-3100batch: iter_time=1.083e-04, loss_ctc=70.515, loss_att=52.023, acc=0.704, loss=57.568, grad_norm=4.463, loss_scale=1.000, learning_rate=9.375e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 02:23:59,385 (deepspeed_trainer:228) INFO: 19epoch:train:3101-3200batch: iter_time=1.084e-04, loss_ctc=72.016, loss_att=52.568, acc=0.712, loss=58.428, grad_norm=4.495, loss_scale=1.000, learning_rate=9.374e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 02:24:36,481 (deepspeed_trainer:228) INFO: 19epoch:train:3201-3300batch: iter_time=1.085e-04, loss_ctc=66.239, loss_att=48.256, acc=0.713, loss=53.655, grad_norm=4.255, loss_scale=1.000, learning_rate=9.372e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 02:25:13,818 (deepspeed_trainer:228) INFO: 19epoch:train:3301-3400batch: iter_time=1.088e-04, loss_ctc=75.906, loss_att=56.667, acc=0.698, loss=62.432, grad_norm=4.829, loss_scale=1.000, learning_rate=9.370e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 02:25:51,456 (deepspeed_trainer:228) INFO: 19epoch:train:3401-3500batch: iter_time=1.090e-04, loss_ctc=88.819, loss_att=76.232, acc=0.675, loss=79.978, grad_norm=5.687, loss_scale=1.000, learning_rate=9.368e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 02:26:28,497 (deepspeed_trainer:228) INFO: 19epoch:train:3501-3600batch: iter_time=1.094e-04, loss_ctc=74.040, loss_att=56.480, acc=0.708, loss=61.723, grad_norm=4.714, loss_scale=1.000, learning_rate=9.367e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 02:27:06,429 (deepspeed_trainer:228) INFO: 19epoch:train:3601-3700batch: iter_time=1.087e-04, loss_ctc=84.824, loss_att=76.748, acc=0.681, loss=79.164, grad_norm=5.468, loss_scale=1.000, learning_rate=9.365e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 02:27:28,837 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-07 02:27:54,951 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 02:28:10,693 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 02:28:10,693 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 02:28:10,695 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 02:28:36,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:28:36,981] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:28:37,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:28:35,816] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:28:38,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:28:36,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:28:36,594] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:28:38,806] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:28:38,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:28:39,394] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:28:39,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:28:37,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:28:37,480] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:28:37,592] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:28:38,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:28:38,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:24,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:24,624] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:25,435] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:26,372] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:24,476] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:24,542] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:24,767] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:25,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:27,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:25,511] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:28,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:25,882] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:28,057] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:28,374] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:26,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:29:29,600] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:10,887] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:11,261] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:12,280] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:13,381] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:11,974] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:12,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:12,899] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:12,944] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:13,167] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:16,069] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:13,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:16,318] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:16,721] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:16,831] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:16,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:23,017] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:56,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:57,309] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:58,840] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:31:01,224] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:59,288] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:59,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:30:59,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:31:00,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:31:01,045] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:31:03,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:31:01,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:31:04,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:31:04,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:31:04,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:31:06,053] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:31:16,606] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 02:32:02,796 (deepspeed_trainer:228) INFO: 19epoch:train:3701-3800batch: iter_time=2.462, loss_ctc=77.686, loss_att=63.443, acc=0.691, loss=67.753, grad_norm=4.922, loss_scale=1.000, learning_rate=9.363e-05, step_time=0.501 [cnode7-012:0/16] 2024-12-07 02:32:39,911 (deepspeed_trainer:228) INFO: 19epoch:train:3801-3900batch: iter_time=1.107e-04, loss_ctc=66.798, loss_att=52.386, acc=0.697, loss=56.699, grad_norm=4.000, loss_scale=1.000, learning_rate=9.362e-05, step_time=0.371 [2024-12-07 02:33:16,741] [INFO] [logging.py:129:log_dist] [Rank 0] step=274000, skipped=0, lr=[np.float64(9.359002147719463e-05)], mom=[[0.9, 0.98]] [2024-12-07 02:33:16,741] [INFO] [timer.py:264:stop] epoch=0/micro_step=19000/global_step=19000, RunningAvgSamplesPerSec=43.85421386994565, CurrSamplesPerSec=43.95044371008912, MemAllocated=2.04GB, MaxMemAllocated=15.53GB [cnode7-012:0/16] 2024-12-07 02:33:16,743 (deepspeed_trainer:228) INFO: 19epoch:train:3901-4000batch: iter_time=1.101e-04, loss_ctc=72.922, loss_att=52.384, acc=0.706, loss=58.553, grad_norm=4.430, loss_scale=1.000, learning_rate=9.360e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 02:33:53,748 (deepspeed_trainer:228) INFO: 19epoch:train:4001-4100batch: iter_time=1.094e-04, loss_ctc=79.807, loss_att=58.116, acc=0.696, loss=64.641, grad_norm=4.897, loss_scale=1.000, learning_rate=9.358e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 02:34:30,666 (deepspeed_trainer:228) INFO: 19epoch:train:4101-4200batch: iter_time=1.159e-04, loss_ctc=74.316, loss_att=57.066, acc=0.691, loss=62.244, grad_norm=4.962, loss_scale=1.000, learning_rate=9.356e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 02:35:07,538 (deepspeed_trainer:228) INFO: 19epoch:train:4201-4300batch: iter_time=1.093e-04, loss_ctc=79.115, loss_att=59.225, acc=0.699, loss=65.176, grad_norm=4.984, loss_scale=1.000, learning_rate=9.355e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 02:35:44,425 (deepspeed_trainer:228) INFO: 19epoch:train:4301-4400batch: iter_time=1.085e-04, loss_ctc=86.289, loss_att=67.921, acc=0.683, loss=73.435, grad_norm=4.675, loss_scale=1.000, learning_rate=9.353e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 02:36:21,884 (deepspeed_trainer:228) INFO: 19epoch:train:4401-4500batch: iter_time=1.125e-04, loss_ctc=78.394, loss_att=63.046, acc=0.701, loss=67.680, grad_norm=4.461, loss_scale=1.000, learning_rate=9.351e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 02:36:58,749 (deepspeed_trainer:228) INFO: 19epoch:train:4501-4600batch: iter_time=1.105e-04, loss_ctc=75.869, loss_att=58.990, acc=0.692, loss=64.047, grad_norm=4.743, loss_scale=1.000, learning_rate=9.350e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 02:37:35,641 (deepspeed_trainer:228) INFO: 19epoch:train:4601-4700batch: iter_time=1.116e-04, loss_ctc=72.830, loss_att=53.360, acc=0.706, loss=59.183, grad_norm=5.031, loss_scale=1.000, learning_rate=9.348e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 02:38:12,225 (deepspeed_trainer:228) INFO: 19epoch:train:4701-4800batch: iter_time=1.098e-04, loss_ctc=68.750, loss_att=47.902, acc=0.710, loss=54.152, grad_norm=4.411, loss_scale=1.000, learning_rate=9.346e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 02:38:48,970 (deepspeed_trainer:228) INFO: 19epoch:train:4801-4900batch: iter_time=1.082e-04, loss_ctc=73.967, loss_att=54.396, acc=0.686, loss=60.285, grad_norm=4.798, loss_scale=1.000, learning_rate=9.345e-05, step_time=0.367 [2024-12-07 02:39:25,681] [INFO] [logging.py:129:log_dist] [Rank 0] step=275000, skipped=0, lr=[np.float64(9.341970344553092e-05)], mom=[[0.9, 0.98]] [2024-12-07 02:39:25,682] [INFO] [timer.py:264:stop] epoch=0/micro_step=20000/global_step=20000, RunningAvgSamplesPerSec=43.884907414057274, CurrSamplesPerSec=45.17542055879205, MemAllocated=2.04GB, MaxMemAllocated=15.53GB [cnode7-012:0/16] 2024-12-07 02:39:25,683 (deepspeed_trainer:228) INFO: 19epoch:train:4901-5000batch: iter_time=1.107e-04, loss_ctc=71.172, loss_att=50.569, acc=0.708, loss=56.755, grad_norm=4.428, loss_scale=1.000, learning_rate=9.343e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 02:40:03,013 (deepspeed_trainer:228) INFO: 19epoch:train:5001-5100batch: iter_time=1.108e-04, loss_ctc=69.213, loss_att=52.452, acc=0.709, loss=57.476, grad_norm=4.256, loss_scale=1.000, learning_rate=9.341e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 02:40:39,708 (deepspeed_trainer:228) INFO: 19epoch:train:5101-5200batch: iter_time=1.121e-04, loss_ctc=68.579, loss_att=51.785, acc=0.699, loss=56.807, grad_norm=4.408, loss_scale=1.000, learning_rate=9.339e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 02:41:16,750 (deepspeed_trainer:228) INFO: 19epoch:train:5201-5300batch: iter_time=1.111e-04, loss_ctc=80.124, loss_att=60.593, acc=0.690, loss=66.469, grad_norm=4.591, loss_scale=1.000, learning_rate=9.338e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 02:41:53,863 (deepspeed_trainer:228) INFO: 19epoch:train:5301-5400batch: iter_time=1.100e-04, loss_ctc=84.420, loss_att=72.787, acc=0.679, loss=76.254, grad_norm=4.808, loss_scale=1.000, learning_rate=9.336e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 02:42:30,875 (deepspeed_trainer:228) INFO: 19epoch:train:5401-5500batch: iter_time=1.120e-04, loss_ctc=77.717, loss_att=61.682, acc=0.699, loss=66.495, grad_norm=4.381, loss_scale=1.000, learning_rate=9.334e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 02:43:07,743 (deepspeed_trainer:228) INFO: 19epoch:train:5501-5600batch: iter_time=1.104e-04, loss_ctc=81.960, loss_att=69.194, acc=0.688, loss=73.041, grad_norm=5.487, loss_scale=1.000, learning_rate=9.333e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 02:43:20,653 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-07 02:43:46,621 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 02:44:02,944 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 02:44:02,944 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 02:44:02,947 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 02:44:25,131] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:44:28,592] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:44:28,635] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:44:27,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:44:29,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:44:29,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:44:27,754] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:44:28,265] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:44:30,559] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:44:28,329] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:44:28,385] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:44:30,793] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:44:31,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:44:29,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:44:31,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:44:29,759] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:12,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:16,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:16,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:17,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:14,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:16,167] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:18,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:18,766] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:16,601] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:19,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:17,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:19,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:17,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:17,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:17,882] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:45:20,250] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:00,313] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:04,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:02,315] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:04,699] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:03,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:06,334] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:06,629] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:04,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:06,837] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:04,756] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:07,384] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:07,716] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:08,059] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:05,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:06,326] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:06,565] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:47,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:51,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:52,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:50,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:51,948] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:54,895] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:53,218] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:53,279] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:55,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:54,248] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:56,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:54,553] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:57,034] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:55,065] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:57,526] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 02:46:58,701] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 02:47:53,671 (deepspeed_trainer:228) INFO: 19epoch:train:5601-5700batch: iter_time=2.465, loss_ctc=75.903, loss_att=62.220, acc=0.688, loss=66.344, grad_norm=4.713, loss_scale=1.000, learning_rate=9.331e-05, step_time=0.394 [cnode7-012:0/16] 2024-12-07 02:48:30,632 (deepspeed_trainer:228) INFO: 19epoch:train:5701-5800batch: iter_time=1.093e-04, loss_ctc=67.735, loss_att=48.678, acc=0.704, loss=54.397, grad_norm=4.713, loss_scale=1.000, learning_rate=9.329e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 02:49:07,918 (deepspeed_trainer:228) INFO: 19epoch:train:5801-5900batch: iter_time=1.141e-04, loss_ctc=80.118, loss_att=57.734, acc=0.703, loss=64.441, grad_norm=4.451, loss_scale=1.000, learning_rate=9.328e-05, step_time=0.373 [2024-12-07 02:49:45,487] [INFO] [logging.py:129:log_dist] [Rank 0] step=276000, skipped=0, lr=[np.float64(9.325031189245996e-05)], mom=[[0.9, 0.98]] [2024-12-07 02:49:45,488] [INFO] [timer.py:264:stop] epoch=0/micro_step=21000/global_step=21000, RunningAvgSamplesPerSec=43.88952116330041, CurrSamplesPerSec=41.265887564910166, MemAllocated=2.04GB, MaxMemAllocated=15.53GB [cnode7-012:0/16] 2024-12-07 02:49:45,491 (deepspeed_trainer:228) INFO: 19epoch:train:5901-6000batch: iter_time=1.152e-04, loss_ctc=74.878, loss_att=55.725, acc=0.700, loss=61.463, grad_norm=4.474, loss_scale=1.000, learning_rate=9.326e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 02:50:22,903 (deepspeed_trainer:228) INFO: 19epoch:train:6001-6100batch: iter_time=1.148e-04, loss_ctc=76.476, loss_att=53.387, acc=0.703, loss=60.315, grad_norm=4.713, loss_scale=1.000, learning_rate=9.324e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 02:51:00,059 (deepspeed_trainer:228) INFO: 19epoch:train:6101-6200batch: iter_time=1.117e-04, loss_ctc=74.888, loss_att=58.384, acc=0.694, loss=63.337, grad_norm=4.831, loss_scale=1.000, learning_rate=9.322e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 02:51:37,514 (deepspeed_trainer:228) INFO: 19epoch:train:6201-6300batch: iter_time=1.145e-04, loss_ctc=84.302, loss_att=64.611, acc=0.694, loss=70.556, grad_norm=4.985, loss_scale=1.000, learning_rate=9.321e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 02:52:14,985 (deepspeed_trainer:228) INFO: 19epoch:train:6301-6400batch: iter_time=1.142e-04, loss_ctc=80.466, loss_att=63.537, acc=0.696, loss=68.635, grad_norm=4.815, loss_scale=1.000, learning_rate=9.319e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 02:52:51,833 (deepspeed_trainer:228) INFO: 19epoch:train:6401-6500batch: iter_time=1.125e-04, loss_ctc=79.159, loss_att=61.788, acc=0.699, loss=66.979, grad_norm=4.593, loss_scale=1.000, learning_rate=9.317e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 02:53:28,826 (deepspeed_trainer:228) INFO: 19epoch:train:6501-6600batch: iter_time=1.215e-04, loss_ctc=64.743, loss_att=46.582, acc=0.711, loss=52.053, grad_norm=3.997, loss_scale=1.000, learning_rate=9.316e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 02:54:05,948 (deepspeed_trainer:228) INFO: 19epoch:train:6601-6700batch: iter_time=1.175e-04, loss_ctc=78.117, loss_att=54.144, acc=0.700, loss=61.328, grad_norm=4.791, loss_scale=1.000, learning_rate=9.314e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 02:54:42,564 (deepspeed_trainer:228) INFO: 19epoch:train:6701-6800batch: iter_time=1.119e-04, loss_ctc=68.972, loss_att=53.850, acc=0.689, loss=58.377, grad_norm=4.304, loss_scale=1.000, learning_rate=9.312e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 02:55:19,204 (deepspeed_trainer:228) INFO: 19epoch:train:6801-6900batch: iter_time=1.085e-04, loss_ctc=72.136, loss_att=49.061, acc=0.715, loss=55.975, grad_norm=4.047, loss_scale=1.000, learning_rate=9.311e-05, step_time=0.366 [2024-12-07 02:55:55,757] [INFO] [logging.py:129:log_dist] [Rank 0] step=277000, skipped=0, lr=[np.float64(9.308183844872236e-05)], mom=[[0.9, 0.98]] [2024-12-07 02:55:55,758] [INFO] [timer.py:264:stop] epoch=0/micro_step=22000/global_step=22000, RunningAvgSamplesPerSec=43.91153705746055, CurrSamplesPerSec=46.073620907188094, MemAllocated=2.04GB, MaxMemAllocated=15.53GB [cnode7-012:0/16] 2024-12-07 02:55:55,759 (deepspeed_trainer:228) INFO: 19epoch:train:6901-7000batch: iter_time=1.091e-04, loss_ctc=64.677, loss_att=46.123, acc=0.715, loss=51.695, grad_norm=4.164, loss_scale=1.000, learning_rate=9.309e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 02:56:32,885 (deepspeed_trainer:228) INFO: 19epoch:train:7001-7100batch: iter_time=1.083e-04, loss_ctc=70.772, loss_att=54.916, acc=0.691, loss=59.694, grad_norm=4.719, loss_scale=1.000, learning_rate=9.307e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 02:57:10,396 (deepspeed_trainer:228) INFO: 19epoch:train:7101-7200batch: iter_time=1.082e-04, loss_ctc=81.758, loss_att=64.438, acc=0.685, loss=69.647, grad_norm=4.956, loss_scale=1.000, learning_rate=9.306e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 02:57:47,587 (deepspeed_trainer:228) INFO: 19epoch:train:7201-7300batch: iter_time=1.083e-04, loss_ctc=81.743, loss_att=65.636, acc=0.690, loss=70.463, grad_norm=4.629, loss_scale=1.000, learning_rate=9.304e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 02:58:24,954 (deepspeed_trainer:228) INFO: 19epoch:train:7301-7400batch: iter_time=1.144e-04, loss_ctc=76.837, loss_att=64.845, acc=0.699, loss=68.424, grad_norm=4.235, loss_scale=1.000, learning_rate=9.302e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 02:59:01,917 (deepspeed_trainer:228) INFO: 19epoch:train:7401-7500batch: iter_time=1.110e-04, loss_ctc=82.899, loss_att=73.254, acc=0.673, loss=76.203, grad_norm=5.817, loss_scale=1.000, learning_rate=9.301e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 02:59:05,722 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-07 02:59:32,252 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 02:59:47,858 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 02:59:47,858 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 02:59:47,860 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 03:00:13,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:00:13,494] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:00:14,338] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:00:12,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:00:14,712] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:00:12,956] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:00:13,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:00:13,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:00:15,711] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:00:15,760] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:00:13,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:00:16,170] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:00:13,999] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:00:16,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:00:14,440] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:00:14,492] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:01,033] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:01,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:00,251] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:03,235] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:01,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:01,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:02,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:04,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:04,846] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:02,574] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:02,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:05,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:02,800] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:05,204] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:06,091] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:05,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:48,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:48,761] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:47,707] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:48,887] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:51,664] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:49,398] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:49,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:50,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:53,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:51,266] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:53,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:51,312] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:53,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:53,896] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:55,494] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:01:55,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:35,112] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:34,897] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:37,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:36,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:39,057] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:40,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:38,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:38,280] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:38,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:38,911] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:41,498] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:39,531] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:42,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:42,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:44,410] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:02:45,291] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 03:03:50,539 (deepspeed_trainer:228) INFO: 19epoch:train:7501-7600batch: iter_time=2.468, loss_ctc=75.592, loss_att=55.482, acc=0.710, loss=61.525, grad_norm=4.240, loss_scale=1.000, learning_rate=9.299e-05, step_time=0.418 [cnode7-012:0/16] 2024-12-07 03:04:27,685 (deepspeed_trainer:228) INFO: 19epoch:train:7601-7700batch: iter_time=1.082e-04, loss_ctc=69.606, loss_att=49.121, acc=0.706, loss=55.256, grad_norm=4.463, loss_scale=1.000, learning_rate=9.297e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 03:05:04,568 (deepspeed_trainer:228) INFO: 19epoch:train:7701-7800batch: iter_time=1.108e-04, loss_ctc=75.671, loss_att=55.481, acc=0.705, loss=61.528, grad_norm=4.770, loss_scale=1.000, learning_rate=9.296e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 03:05:41,470 (deepspeed_trainer:228) INFO: 19epoch:train:7801-7900batch: iter_time=1.094e-04, loss_ctc=74.423, loss_att=57.420, acc=0.694, loss=62.528, grad_norm=4.575, loss_scale=1.000, learning_rate=9.294e-05, step_time=0.368 [2024-12-07 03:06:19,020] [INFO] [logging.py:129:log_dist] [Rank 0] step=278000, skipped=0, lr=[np.float64(9.291427485052179e-05)], mom=[[0.9, 0.98]] [2024-12-07 03:06:19,021] [INFO] [timer.py:264:stop] epoch=0/micro_step=23000/global_step=23000, RunningAvgSamplesPerSec=43.896142129603575, CurrSamplesPerSec=46.97749498932108, MemAllocated=2.04GB, MaxMemAllocated=15.53GB [cnode7-012:0/16] 2024-12-07 03:06:19,022 (deepspeed_trainer:228) INFO: 19epoch:train:7901-8000batch: iter_time=1.091e-04, loss_ctc=80.583, loss_att=55.940, acc=0.704, loss=63.307, grad_norm=5.251, loss_scale=1.000, learning_rate=9.292e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 03:06:56,114 (deepspeed_trainer:228) INFO: 19epoch:train:8001-8100batch: iter_time=1.098e-04, loss_ctc=80.668, loss_att=61.034, acc=0.689, loss=66.933, grad_norm=4.820, loss_scale=1.000, learning_rate=9.291e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 03:07:33,121 (deepspeed_trainer:228) INFO: 19epoch:train:8101-8200batch: iter_time=1.144e-04, loss_ctc=73.248, loss_att=62.536, acc=0.698, loss=65.763, grad_norm=4.259, loss_scale=1.000, learning_rate=9.289e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 03:08:10,102 (deepspeed_trainer:228) INFO: 19epoch:train:8201-8300batch: iter_time=1.105e-04, loss_ctc=78.981, loss_att=59.616, acc=0.704, loss=65.425, grad_norm=4.814, loss_scale=1.000, learning_rate=9.287e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 03:08:47,089 (deepspeed_trainer:228) INFO: 19epoch:train:8301-8400batch: iter_time=1.145e-04, loss_ctc=75.752, loss_att=58.677, acc=0.693, loss=63.809, grad_norm=4.383, loss_scale=1.000, learning_rate=9.286e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 03:09:23,665 (deepspeed_trainer:228) INFO: 19epoch:train:8401-8500batch: iter_time=1.084e-04, loss_ctc=61.887, loss_att=45.348, acc=0.718, loss=50.323, grad_norm=4.563, loss_scale=1.000, learning_rate=9.284e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 03:10:00,758 (deepspeed_trainer:228) INFO: 19epoch:train:8501-8600batch: iter_time=1.087e-04, loss_ctc=82.174, loss_att=56.911, acc=0.702, loss=64.508, grad_norm=4.718, loss_scale=1.000, learning_rate=9.282e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 03:10:37,871 (deepspeed_trainer:228) INFO: 19epoch:train:8601-8700batch: iter_time=1.100e-04, loss_ctc=69.304, loss_att=52.518, acc=0.686, loss=57.559, grad_norm=4.498, loss_scale=1.000, learning_rate=9.281e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 03:11:15,033 (deepspeed_trainer:228) INFO: 19epoch:train:8701-8800batch: iter_time=1.086e-04, loss_ctc=71.319, loss_att=50.743, acc=0.710, loss=56.924, grad_norm=4.580, loss_scale=1.000, learning_rate=9.279e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 03:11:52,153 (deepspeed_trainer:228) INFO: 19epoch:train:8801-8900batch: iter_time=1.085e-04, loss_ctc=65.548, loss_att=46.148, acc=0.717, loss=51.977, grad_norm=3.670, loss_scale=1.000, learning_rate=9.277e-05, step_time=0.371 [2024-12-07 03:12:29,254] [INFO] [logging.py:129:log_dist] [Rank 0] step=279000, skipped=0, lr=[np.float64(9.274761293782248e-05)], mom=[[0.9, 0.98]] [2024-12-07 03:12:29,255] [INFO] [timer.py:264:stop] epoch=0/micro_step=24000/global_step=24000, RunningAvgSamplesPerSec=43.91391634924997, CurrSamplesPerSec=46.36366493466381, MemAllocated=2.04GB, MaxMemAllocated=15.53GB [cnode7-012:0/16] 2024-12-07 03:12:29,257 (deepspeed_trainer:228) INFO: 19epoch:train:8901-9000batch: iter_time=1.094e-04, loss_ctc=70.199, loss_att=52.078, acc=0.692, loss=57.524, grad_norm=4.443, loss_scale=1.000, learning_rate=9.276e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 03:13:06,965 (deepspeed_trainer:228) INFO: 19epoch:train:9001-9100batch: iter_time=1.096e-04, loss_ctc=86.516, loss_att=73.955, acc=0.678, loss=77.687, grad_norm=4.749, loss_scale=1.000, learning_rate=9.274e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 03:13:44,369 (deepspeed_trainer:228) INFO: 19epoch:train:9101-9200batch: iter_time=1.064e-04, loss_ctc=74.934, loss_att=59.795, acc=0.700, loss=64.364, grad_norm=4.483, loss_scale=1.000, learning_rate=9.272e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 03:14:21,711 (deepspeed_trainer:228) INFO: 19epoch:train:9201-9300batch: iter_time=1.067e-04, loss_ctc=80.005, loss_att=65.499, acc=0.697, loss=69.826, grad_norm=4.728, loss_scale=1.000, learning_rate=9.271e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 03:14:53,008 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-07 03:15:19,550 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 03:15:36,970 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 03:15:36,970 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 03:15:36,972 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 03:16:01,711] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:02,074] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:02,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:15:59,925] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:02,713] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:02,827] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:00,528] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:00,656] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:03,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:03,970] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:01,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:02,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:02,248] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:04,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:02,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:02,827] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:49,731] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:50,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:50,721] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:48,769] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:51,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:49,541] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:52,101] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:52,256] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:49,907] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:50,123] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:52,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:50,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:53,145] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:51,448] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:51,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:16:54,907] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:37,245] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:37,695] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:39,505] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:40,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:40,605] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:38,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:38,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:40,884] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:38,485] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:41,219] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:38,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:41,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:39,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:39,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:40,031] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:17:45,380] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:24,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:25,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:28,128] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:28,435] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:26,145] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:28,868] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:28,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:26,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:29,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:26,901] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:27,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:27,511] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:27,953] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:28,271] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:31,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:18:34,995] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 03:19:12,527 (deepspeed_trainer:228) INFO: 19epoch:train:9301-9400batch: iter_time=2.469, loss_ctc=85.062, loss_att=71.265, acc=0.680, loss=75.455, grad_norm=5.496, loss_scale=1.000, learning_rate=9.269e-05, step_time=0.439 [cnode7-012:0/16] 2024-12-07 03:19:49,685 (deepspeed_trainer:228) INFO: 19epoch:train:9401-9500batch: iter_time=1.058e-04, loss_ctc=68.255, loss_att=51.544, acc=0.709, loss=56.571, grad_norm=4.375, loss_scale=1.000, learning_rate=9.267e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 03:20:26,386 (deepspeed_trainer:228) INFO: 19epoch:train:9501-9600batch: iter_time=1.059e-04, loss_ctc=71.643, loss_att=50.513, acc=0.711, loss=56.824, grad_norm=4.476, loss_scale=1.000, learning_rate=9.266e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 03:21:03,489 (deepspeed_trainer:228) INFO: 19epoch:train:9601-9700batch: iter_time=1.030e-04, loss_ctc=77.289, loss_att=57.638, acc=0.703, loss=63.527, grad_norm=4.642, loss_scale=1.000, learning_rate=9.264e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 03:21:40,165 (deepspeed_trainer:228) INFO: 19epoch:train:9701-9800batch: iter_time=1.050e-04, loss_ctc=73.769, loss_att=54.893, acc=0.697, loss=60.573, grad_norm=4.771, loss_scale=1.000, learning_rate=9.262e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 03:22:17,198 (deepspeed_trainer:228) INFO: 19epoch:train:9801-9900batch: iter_time=1.067e-04, loss_ctc=75.409, loss_att=53.563, acc=0.703, loss=60.142, grad_norm=5.056, loss_scale=1.000, learning_rate=9.261e-05, step_time=0.370 [2024-12-07 03:22:54,329] [INFO] [logging.py:129:log_dist] [Rank 0] step=280000, skipped=0, lr=[np.float64(9.258184465268017e-05)], mom=[[0.9, 0.98]] [2024-12-07 03:22:54,330] [INFO] [timer.py:264:stop] epoch=0/micro_step=25000/global_step=25000, RunningAvgSamplesPerSec=43.89108716057815, CurrSamplesPerSec=44.28376365982916, MemAllocated=2.04GB, MaxMemAllocated=15.53GB [cnode7-012:0/16] 2024-12-07 03:22:54,332 (deepspeed_trainer:228) INFO: 19epoch:train:9901-10000batch: iter_time=1.101e-04, loss_ctc=89.480, loss_att=67.980, acc=0.684, loss=74.441, grad_norm=5.126, loss_scale=1.000, learning_rate=9.259e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 03:23:31,296 (deepspeed_trainer:228) INFO: 19epoch:train:10001-10100batch: iter_time=1.103e-04, loss_ctc=68.976, loss_att=57.301, acc=0.711, loss=60.801, grad_norm=4.656, loss_scale=1.000, learning_rate=9.257e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 03:24:08,361 (deepspeed_trainer:228) INFO: 19epoch:train:10101-10200batch: iter_time=1.080e-04, loss_ctc=82.289, loss_att=60.653, acc=0.698, loss=67.144, grad_norm=4.377, loss_scale=1.000, learning_rate=9.256e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 03:24:45,052 (deepspeed_trainer:228) INFO: 19epoch:train:10201-10300batch: iter_time=1.061e-04, loss_ctc=72.137, loss_att=56.060, acc=0.700, loss=60.853, grad_norm=4.807, loss_scale=1.000, learning_rate=9.254e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 03:25:22,064 (deepspeed_trainer:228) INFO: 19epoch:train:10301-10400batch: iter_time=1.066e-04, loss_ctc=63.820, loss_att=45.655, acc=0.721, loss=51.125, grad_norm=3.996, loss_scale=1.000, learning_rate=9.252e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 03:25:59,017 (deepspeed_trainer:228) INFO: 19epoch:train:10401-10500batch: iter_time=1.074e-04, loss_ctc=82.049, loss_att=59.041, acc=0.680, loss=65.942, grad_norm=5.168, loss_scale=1.000, learning_rate=9.251e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 03:26:35,669 (deepspeed_trainer:228) INFO: 19epoch:train:10501-10600batch: iter_time=1.064e-04, loss_ctc=69.294, loss_att=50.097, acc=0.706, loss=55.882, grad_norm=4.756, loss_scale=1.000, learning_rate=9.249e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 03:27:12,484 (deepspeed_trainer:228) INFO: 19epoch:train:10601-10700batch: iter_time=1.074e-04, loss_ctc=70.737, loss_att=51.849, acc=0.712, loss=57.514, grad_norm=4.329, loss_scale=1.000, learning_rate=9.247e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 03:27:49,157 (deepspeed_trainer:228) INFO: 19epoch:train:10701-10800batch: iter_time=1.070e-04, loss_ctc=65.708, loss_att=48.665, acc=0.709, loss=53.789, grad_norm=3.770, loss_scale=1.000, learning_rate=9.246e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 03:28:26,077 (deepspeed_trainer:228) INFO: 19epoch:train:10801-10900batch: iter_time=1.052e-04, loss_ctc=74.826, loss_att=56.212, acc=0.698, loss=61.811, grad_norm=4.902, loss_scale=1.000, learning_rate=9.244e-05, step_time=0.369 [2024-12-07 03:29:03,398] [INFO] [logging.py:129:log_dist] [Rank 0] step=281000, skipped=0, lr=[np.float64(9.241696203760563e-05)], mom=[[0.9, 0.98]] [2024-12-07 03:29:03,399] [INFO] [timer.py:264:stop] epoch=0/micro_step=26000/global_step=26000, RunningAvgSamplesPerSec=43.913719472547314, CurrSamplesPerSec=40.22485555077683, MemAllocated=2.04GB, MaxMemAllocated=15.53GB [cnode7-012:0/16] 2024-12-07 03:29:03,400 (deepspeed_trainer:228) INFO: 19epoch:train:10901-11000batch: iter_time=1.126e-04, loss_ctc=87.147, loss_att=75.223, acc=0.675, loss=78.817, grad_norm=5.448, loss_scale=1.000, learning_rate=9.243e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 03:29:40,358 (deepspeed_trainer:228) INFO: 19epoch:train:11001-11100batch: iter_time=1.176e-04, loss_ctc=73.369, loss_att=55.947, acc=0.706, loss=61.173, grad_norm=4.277, loss_scale=1.000, learning_rate=9.241e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 03:30:17,535 (deepspeed_trainer:228) INFO: 19epoch:train:11101-11200batch: iter_time=1.181e-04, loss_ctc=83.308, loss_att=75.003, acc=0.684, loss=77.511, grad_norm=5.392, loss_scale=1.000, learning_rate=9.239e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 03:30:40,434 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-07 03:31:07,243 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 03:31:23,387 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 03:31:23,387 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 03:31:23,389 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 03:31:48,398] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:31:48,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:31:46,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:31:49,201] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:31:47,082] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:31:47,339] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:31:50,192] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:31:47,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:31:50,694] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:31:50,944] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:31:51,320] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:31:51,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:31:48,997] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:31:49,055] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:31:49,111] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:31:49,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:35,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:35,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:35,299] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:35,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:36,059] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:38,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:36,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:39,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:36,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:39,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:39,588] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:37,291] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:37,337] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:40,227] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:40,475] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:32:38,057] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:22,400] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:23,380] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:23,828] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:26,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:24,540] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:27,329] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:24,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:27,594] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:25,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:25,769] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:28,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:26,119] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:26,273] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:26,313] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:30,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:33:31,248] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:08,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:10,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:11,272] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:14,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:14,927] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:12,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:13,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:13,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:16,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:13,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:16,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:14,251] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:14,312] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:14,794] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:20,220] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:34:22,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 03:35:07,203 (deepspeed_trainer:228) INFO: 19epoch:train:11201-11300batch: iter_time=2.465, loss_ctc=76.416, loss_att=62.968, acc=0.693, loss=66.985, grad_norm=4.479, loss_scale=1.000, learning_rate=9.238e-05, step_time=0.431 [cnode7-012:0/16] 2024-12-07 03:35:44,667 (deepspeed_trainer:228) INFO: 19epoch:train:11301-11400batch: iter_time=1.111e-04, loss_ctc=65.989, loss_att=52.759, acc=0.706, loss=56.731, grad_norm=3.918, loss_scale=1.000, learning_rate=9.236e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 03:36:22,059 (deepspeed_trainer:228) INFO: 19epoch:train:11401-11500batch: iter_time=1.091e-04, loss_ctc=72.200, loss_att=52.899, acc=0.713, loss=58.667, grad_norm=4.442, loss_scale=1.000, learning_rate=9.234e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 03:36:59,463 (deepspeed_trainer:228) INFO: 19epoch:train:11501-11600batch: iter_time=1.122e-04, loss_ctc=79.365, loss_att=59.429, acc=0.701, loss=65.447, grad_norm=5.212, loss_scale=1.000, learning_rate=9.233e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 03:37:36,493 (deepspeed_trainer:228) INFO: 19epoch:train:11601-11700batch: iter_time=1.126e-04, loss_ctc=73.435, loss_att=56.913, acc=0.700, loss=61.872, grad_norm=4.511, loss_scale=1.000, learning_rate=9.231e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 03:38:13,736 (deepspeed_trainer:228) INFO: 19epoch:train:11701-11800batch: iter_time=1.081e-04, loss_ctc=77.817, loss_att=59.100, acc=0.706, loss=64.717, grad_norm=4.864, loss_scale=1.000, learning_rate=9.229e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 03:38:51,119 (deepspeed_trainer:228) INFO: 19epoch:train:11801-11900batch: iter_time=1.146e-04, loss_ctc=85.116, loss_att=66.972, acc=0.693, loss=72.427, grad_norm=5.021, loss_scale=1.000, learning_rate=9.228e-05, step_time=0.373 [2024-12-07 03:39:28,750] [INFO] [logging.py:129:log_dist] [Rank 0] step=282000, skipped=0, lr=[np.float64(9.225295723396042e-05)], mom=[[0.9, 0.98]] [2024-12-07 03:39:28,751] [INFO] [timer.py:264:stop] epoch=0/micro_step=27000/global_step=27000, RunningAvgSamplesPerSec=43.8914240922659, CurrSamplesPerSec=40.58091141707051, MemAllocated=2.04GB, MaxMemAllocated=15.53GB [cnode7-012:0/16] 2024-12-07 03:39:28,753 (deepspeed_trainer:228) INFO: 19epoch:train:11901-12000batch: iter_time=1.163e-04, loss_ctc=76.887, loss_att=63.071, acc=0.708, loss=67.222, grad_norm=4.421, loss_scale=1.000, learning_rate=9.226e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 03:40:06,069 (deepspeed_trainer:228) INFO: 19epoch:train:12001-12100batch: iter_time=1.124e-04, loss_ctc=75.465, loss_att=59.242, acc=0.701, loss=64.083, grad_norm=4.338, loss_scale=1.000, learning_rate=9.224e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 03:40:43,326 (deepspeed_trainer:228) INFO: 19epoch:train:12101-12200batch: iter_time=1.159e-04, loss_ctc=71.776, loss_att=52.751, acc=0.715, loss=58.441, grad_norm=4.502, loss_scale=1.000, learning_rate=9.223e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 03:41:20,530 (deepspeed_trainer:228) INFO: 19epoch:train:12201-12300batch: iter_time=1.142e-04, loss_ctc=67.450, loss_att=48.166, acc=0.716, loss=53.936, grad_norm=4.965, loss_scale=1.000, learning_rate=9.221e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 03:41:57,879 (deepspeed_trainer:228) INFO: 19epoch:train:12301-12400batch: iter_time=1.158e-04, loss_ctc=71.921, loss_att=54.664, acc=0.691, loss=59.866, grad_norm=4.643, loss_scale=1.000, learning_rate=9.220e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 03:42:35,038 (deepspeed_trainer:228) INFO: 19epoch:train:12401-12500batch: iter_time=1.171e-04, loss_ctc=69.478, loss_att=51.220, acc=0.711, loss=56.678, grad_norm=4.810, loss_scale=1.000, learning_rate=9.218e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 03:43:12,326 (deepspeed_trainer:228) INFO: 19epoch:train:12501-12600batch: iter_time=1.140e-04, loss_ctc=68.656, loss_att=52.327, acc=0.715, loss=57.250, grad_norm=4.459, loss_scale=1.000, learning_rate=9.216e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 03:43:49,548 (deepspeed_trainer:228) INFO: 19epoch:train:12601-12700batch: iter_time=1.125e-04, loss_ctc=67.079, loss_att=49.946, acc=0.707, loss=55.082, grad_norm=3.984, loss_scale=1.000, learning_rate=9.215e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 03:44:27,018 (deepspeed_trainer:228) INFO: 19epoch:train:12701-12800batch: iter_time=1.119e-04, loss_ctc=78.630, loss_att=59.966, acc=0.698, loss=65.548, grad_norm=4.473, loss_scale=1.000, learning_rate=9.213e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 03:45:04,835 (deepspeed_trainer:228) INFO: 19epoch:train:12801-12900batch: iter_time=1.116e-04, loss_ctc=83.283, loss_att=71.737, acc=0.688, loss=75.210, grad_norm=4.774, loss_scale=1.000, learning_rate=9.211e-05, step_time=0.378 [2024-12-07 03:45:42,491] [INFO] [logging.py:129:log_dist] [Rank 0] step=283000, skipped=0, lr=[np.float64(9.208982248038345e-05)], mom=[[0.9, 0.98]] [2024-12-07 03:45:42,491] [INFO] [timer.py:264:stop] epoch=0/micro_step=28000/global_step=28000, RunningAvgSamplesPerSec=43.89343397418506, CurrSamplesPerSec=42.221990284623224, MemAllocated=2.04GB, MaxMemAllocated=15.53GB [cnode7-012:0/16] 2024-12-07 03:45:42,493 (deepspeed_trainer:228) INFO: 19epoch:train:12901-13000batch: iter_time=1.113e-04, loss_ctc=75.991, loss_att=61.113, acc=0.707, loss=65.574, grad_norm=4.529, loss_scale=1.000, learning_rate=9.210e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 03:46:19,673 (deepspeed_trainer:228) INFO: 19epoch:train:13001-13100batch: iter_time=1.090e-04, loss_ctc=80.764, loss_att=71.126, acc=0.692, loss=73.973, grad_norm=5.570, loss_scale=1.000, learning_rate=9.208e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 03:46:32,668 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-07 03:46:58,958 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 03:47:14,662 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 03:47:14,662 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 03:47:14,665 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 03:47:41,480] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:47:41,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:47:41,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:47:42,324] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:47:39,711] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:47:40,934] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:47:43,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:47:41,315] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:47:41,382] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:47:44,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:47:44,301] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:47:44,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:47:41,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:47:42,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:47:42,131] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:47:42,722] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:28,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:29,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:30,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:28,959] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:31,794] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:31,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:29,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:32,568] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:32,868] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:30,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:33,522] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:31,248] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:31,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:31,457] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:31,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:48:33,919] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:18,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:19,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:17,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:20,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:20,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:20,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:18,270] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:21,290] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:22,492] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:22,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:20,115] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:20,664] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:20,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:21,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:25,300] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:49:27,528] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:06,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:04,694] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:05,711] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:08,842] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:08,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:11,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:11,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:11,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:09,312] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:12,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:09,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:09,548] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:13,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:12,587] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:16,058] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 03:50:20,717] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 03:51:17,932 (deepspeed_trainer:228) INFO: 19epoch:train:13101-13200batch: iter_time=2.473, loss_ctc=75.172, loss_att=62.528, acc=0.697, loss=66.353, grad_norm=4.247, loss_scale=1.000, learning_rate=9.207e-05, step_time=0.509 [cnode7-012:0/16] 2024-12-07 03:51:54,683 (deepspeed_trainer:228) INFO: 19epoch:train:13201-13300batch: iter_time=1.071e-04, loss_ctc=67.055, loss_att=49.497, acc=0.710, loss=54.766, grad_norm=4.065, loss_scale=1.000, learning_rate=9.205e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 03:52:31,619 (deepspeed_trainer:228) INFO: 19epoch:train:13301-13400batch: iter_time=1.073e-04, loss_ctc=78.231, loss_att=58.332, acc=0.711, loss=64.309, grad_norm=4.601, loss_scale=1.000, learning_rate=9.203e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 03:53:08,701 (deepspeed_trainer:228) INFO: 19epoch:train:13401-13500batch: iter_time=1.089e-04, loss_ctc=73.549, loss_att=57.063, acc=0.704, loss=61.986, grad_norm=4.219, loss_scale=1.000, learning_rate=9.202e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 03:53:45,558 (deepspeed_trainer:228) INFO: 19epoch:train:13501-13600batch: iter_time=1.084e-04, loss_ctc=75.797, loss_att=53.620, acc=0.708, loss=60.295, grad_norm=4.321, loss_scale=1.000, learning_rate=9.200e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 03:54:22,651 (deepspeed_trainer:228) INFO: 19epoch:train:13601-13700batch: iter_time=1.098e-04, loss_ctc=73.163, loss_att=57.057, acc=0.703, loss=61.870, grad_norm=4.897, loss_scale=1.000, learning_rate=9.198e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 03:55:00,229 (deepspeed_trainer:228) INFO: 19epoch:train:13701-13800batch: iter_time=1.092e-04, loss_ctc=82.644, loss_att=63.874, acc=0.700, loss=69.528, grad_norm=4.820, loss_scale=1.000, learning_rate=9.197e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 03:55:37,492 (deepspeed_trainer:228) INFO: 19epoch:train:13801-13900batch: iter_time=1.099e-04, loss_ctc=79.722, loss_att=63.612, acc=0.705, loss=68.445, grad_norm=4.911, loss_scale=1.000, learning_rate=9.195e-05, step_time=0.372 [2024-12-07 03:56:14,702] [INFO] [logging.py:129:log_dist] [Rank 0] step=284000, skipped=0, lr=[np.float64(9.192755011124845e-05)], mom=[[0.9, 0.98]] [2024-12-07 03:56:14,702] [INFO] [timer.py:264:stop] epoch=0/micro_step=29000/global_step=29000, RunningAvgSamplesPerSec=43.84638755525693, CurrSamplesPerSec=45.41012598573382, MemAllocated=2.04GB, MaxMemAllocated=15.53GB [cnode7-012:0/16] 2024-12-07 03:56:14,704 (deepspeed_trainer:228) INFO: 19epoch:train:13901-14000batch: iter_time=1.069e-04, loss_ctc=77.943, loss_att=61.808, acc=0.706, loss=66.638, grad_norm=4.394, loss_scale=1.000, learning_rate=9.194e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 03:56:51,441 (deepspeed_trainer:228) INFO: 19epoch:train:14001-14100batch: iter_time=1.087e-04, loss_ctc=64.329, loss_att=46.822, acc=0.716, loss=52.061, grad_norm=4.179, loss_scale=1.000, learning_rate=9.192e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 03:57:28,457 (deepspeed_trainer:228) INFO: 19epoch:train:14101-14200batch: iter_time=1.088e-04, loss_ctc=77.169, loss_att=54.678, acc=0.706, loss=61.434, grad_norm=4.814, loss_scale=1.000, learning_rate=9.190e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 03:58:05,361 (deepspeed_trainer:228) INFO: 19epoch:train:14201-14300batch: iter_time=1.088e-04, loss_ctc=67.308, loss_att=55.346, acc=0.692, loss=58.906, grad_norm=4.717, loss_scale=1.000, learning_rate=9.189e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 03:58:42,405 (deepspeed_trainer:228) INFO: 19epoch:train:14301-14400batch: iter_time=1.108e-04, loss_ctc=70.781, loss_att=49.469, acc=0.719, loss=55.877, grad_norm=4.187, loss_scale=1.000, learning_rate=9.187e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 03:59:19,578 (deepspeed_trainer:228) INFO: 19epoch:train:14401-14500batch: iter_time=1.091e-04, loss_ctc=64.190, loss_att=46.001, acc=0.722, loss=51.466, grad_norm=3.974, loss_scale=1.000, learning_rate=9.185e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 03:59:56,605 (deepspeed_trainer:228) INFO: 19epoch:train:14501-14600batch: iter_time=1.072e-04, loss_ctc=70.343, loss_att=54.109, acc=0.697, loss=58.974, grad_norm=4.389, loss_scale=1.000, learning_rate=9.184e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 04:00:33,711 (deepspeed_trainer:228) INFO: 19epoch:train:14601-14700batch: iter_time=1.080e-04, loss_ctc=79.886, loss_att=63.740, acc=0.691, loss=68.608, grad_norm=4.606, loss_scale=1.000, learning_rate=9.182e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 04:01:10,941 (deepspeed_trainer:228) INFO: 19epoch:train:14701-14800batch: iter_time=1.097e-04, loss_ctc=79.974, loss_att=65.349, acc=0.699, loss=69.719, grad_norm=4.485, loss_scale=1.000, learning_rate=9.181e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 04:01:48,174 (deepspeed_trainer:228) INFO: 19epoch:train:14801-14900batch: iter_time=1.084e-04, loss_ctc=75.856, loss_att=64.224, acc=0.707, loss=67.748, grad_norm=4.276, loss_scale=1.000, learning_rate=9.179e-05, step_time=0.372 [2024-12-07 04:02:25,275] [INFO] [logging.py:129:log_dist] [Rank 0] step=285000, skipped=0, lr=[np.float64(9.176613255515091e-05)], mom=[[0.9, 0.98]] [2024-12-07 04:02:25,276] [INFO] [timer.py:264:stop] epoch=0/micro_step=30000/global_step=30000, RunningAvgSamplesPerSec=43.86020555915373, CurrSamplesPerSec=46.88956058133345, MemAllocated=2.04GB, MaxMemAllocated=15.53GB [cnode7-012:0/16] 2024-12-07 04:02:25,277 (deepspeed_trainer:228) INFO: 19epoch:train:14901-15000batch: iter_time=1.099e-04, loss_ctc=82.507, loss_att=74.698, acc=0.682, loss=77.047, grad_norm=5.510, loss_scale=1.000, learning_rate=9.177e-05, step_time=0.371 [2024-12-07 04:02:39,385] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:39,841] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:37,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:40,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:40,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:38,233] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:38,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:41,018] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:38,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:41,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:41,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:41,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:38,544] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:38,571] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:38,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:38,663] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:54,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:55,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:53,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:56,230] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:53,840] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:57,053] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:54,696] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:54,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:57,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:57,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:57,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:57,995] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:55,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:55,652] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:55,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:02:55,740] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:10,574] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:11,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:11,548] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:08,947] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:09,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:12,250] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:13,415] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:11,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:11,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:14,246] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:14,253] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:14,464] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:11,970] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:12,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:12,272] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:12,313] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:25,935] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:25,984] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:26,345] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:23,840] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:24,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:28,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:28,707] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:26,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:29,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:27,155] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:30,192] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:27,606] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:30,418] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:27,711] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:28,141] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:28,192] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:03:43,316] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 19 is about to be saved! [2024-12-07 04:03:43,346] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/mp_rank_00_model_states.pt [2024-12-07 04:03:43,346] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/mp_rank_00_model_states.pt... [2024-12-07 04:03:45,314] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/mp_rank_00_model_states.pt. [2024-12-07 04:03:42,738] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-07 04:03:42,738] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-07 04:03:42,738] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-07 04:03:45,470] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-07 04:03:45,470] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-07 04:03:45,470] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-07 04:03:45,470] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-07 04:03:42,739] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-07 04:03:42,739] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-07 04:03:45,472] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-07 04:03:42,742] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-07 04:03:45,474] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-07 04:03:45,474] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-07 04:03:42,743] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-07 04:03:45,478] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-07 04:03:42,746] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-07 04:03:46,212] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-07 04:03:46,212] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-07 04:03:46,212] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [2024-12-07 04:03:46,232] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-07 04:03:46,232] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-07 04:03:46,232] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [2024-12-07 04:03:46,258] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-07 04:03:46,258] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-07 04:03:46,258] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [2024-12-07 04:03:43,536] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-07 04:03:43,536] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-07 04:03:43,536] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [2024-12-07 04:03:46,272] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-07 04:03:46,272] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-07 04:03:46,272] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [2024-12-07 04:03:43,542] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-07 04:03:43,542] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-07 04:03:43,542] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [2024-12-07 04:03:43,543] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-07 04:03:43,543] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-07 04:03:43,543] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [2024-12-07 04:03:43,548] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-07 04:03:43,548] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-07 04:03:43,548] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [2024-12-07 04:03:46,280] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-07 04:03:46,281] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-07 04:03:46,281] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [2024-12-07 04:03:43,549] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-07 04:03:43,549] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-07 04:03:43,549] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [2024-12-07 04:03:46,283] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-07 04:03:46,283] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-07 04:03:46,284] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [2024-12-07 04:03:43,554] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-07 04:03:43,555] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-07 04:03:43,555] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [2024-12-07 04:03:46,301] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-07 04:03:46,301] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-07 04:03:46,301] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [2024-12-07 04:03:46,301] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-07 04:03:46,307] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-07 04:03:46,307] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [2024-12-07 04:03:43,590] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-07 04:03:43,590] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-07 04:03:43,590] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [2024-12-07 04:03:43,594] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-07 04:03:43,594] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_19/19/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-07 04:03:43,594] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 19 is ready now! [cnode7-012:0/16] 2024-12-07 04:03:46,329 (deepspeed_trainer:158) INFO: 19epoch results: [train] iter_time=0.132, loss_ctc=75.525, loss_att=58.113, acc=0.699, loss=63.338, grad_norm=4.657, loss_scale=1.000, learning_rate=9.301e-05, step_time=0.375, time=2 hours, 6 minutes and 57.97 seconds, total_count=285019, gpu_max_cached_mem_GB=27.982, [valid] loss_ctc=4.531, cer_ctc=0.110, loss_att=7.531, acc=0.787, cer=0.433, wer=1.000, loss=6.625, time=1 minute and 9.42 seconds, total_count=19, gpu_max_cached_mem_GB=27.982 [cnode7-012:0/16] 2024-12-07 04:03:48,112 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-07 04:04:14,473 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 04:04:30,286 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 04:04:30,286 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 04:04:30,288 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 04:04:47,830] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:04:51,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:04:48,804] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:04:51,891] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:04:49,452] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:04:52,278] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:04:49,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:04:52,561] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:04:49,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:04:50,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:04:52,981] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:04:53,017] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:04:50,392] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:04:50,772] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:04:53,706] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:04:54,444] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:37,999] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:35,410] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:35,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:38,518] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:39,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:36,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:37,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:40,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:37,586] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:40,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:37,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:40,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:38,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:38,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:42,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:05:43,311] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:22,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:23,078] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:23,680] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:27,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:27,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:25,170] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:28,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:25,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:25,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:29,090] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:29,276] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:29,531] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:29,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:27,807] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:30,801] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:06:29,374] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:10,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:14,980] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:12,472] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:12,902] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:13,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:14,143] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:17,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:14,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:17,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:18,261] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:18,599] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:18,812] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:19,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:19,372] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:18,384] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:07:19,765] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 04:08:26,346 (deepspeed_trainer:228) INFO: 20epoch:train:1-100batch: iter_time=2.377, loss_ctc=77.595, loss_att=55.262, acc=0.705, loss=61.945, grad_norm=4.813, loss_scale=1.000, learning_rate=9.176e-05, step_time=0.406 [cnode7-012:0/16] 2024-12-07 04:09:03,726 (deepspeed_trainer:228) INFO: 20epoch:train:101-200batch: iter_time=1.028e-04, loss_ctc=82.673, loss_att=64.522, acc=0.696, loss=69.961, grad_norm=4.825, loss_scale=1.000, learning_rate=9.174e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 04:09:40,908 (deepspeed_trainer:228) INFO: 20epoch:train:201-300batch: iter_time=1.035e-04, loss_ctc=71.784, loss_att=54.079, acc=0.717, loss=59.385, grad_norm=4.273, loss_scale=1.000, learning_rate=9.173e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 04:10:17,918 (deepspeed_trainer:228) INFO: 20epoch:train:301-400batch: iter_time=1.079e-04, loss_ctc=64.069, loss_att=48.048, acc=0.703, loss=52.873, grad_norm=4.506, loss_scale=1.000, learning_rate=9.171e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 04:10:55,518 (deepspeed_trainer:228) INFO: 20epoch:train:401-500batch: iter_time=1.064e-04, loss_ctc=76.911, loss_att=55.519, acc=0.701, loss=61.938, grad_norm=5.114, loss_scale=1.000, learning_rate=9.169e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 04:11:33,252 (deepspeed_trainer:228) INFO: 20epoch:train:501-600batch: iter_time=1.074e-04, loss_ctc=80.919, loss_att=63.517, acc=0.697, loss=68.687, grad_norm=4.761, loss_scale=1.000, learning_rate=9.168e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 04:12:10,803 (deepspeed_trainer:228) INFO: 20epoch:train:601-700batch: iter_time=1.060e-04, loss_ctc=80.422, loss_att=58.454, acc=0.706, loss=65.050, grad_norm=5.917, loss_scale=1.000, learning_rate=9.166e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 04:12:48,458 (deepspeed_trainer:228) INFO: 20epoch:train:701-800batch: iter_time=1.048e-04, loss_ctc=85.473, loss_att=66.601, acc=0.699, loss=72.267, grad_norm=4.940, loss_scale=1.000, learning_rate=9.165e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 04:13:25,962 (deepspeed_trainer:228) INFO: 20epoch:train:801-900batch: iter_time=1.056e-04, loss_ctc=74.426, loss_att=59.302, acc=0.696, loss=63.812, grad_norm=4.860, loss_scale=1.000, learning_rate=9.163e-05, step_time=0.375 [2024-12-07 04:14:03,606] [INFO] [logging.py:129:log_dist] [Rank 0] step=286000, skipped=0, lr=[np.float64(9.16055623334244e-05)], mom=[[0.9, 0.98]] [2024-12-07 04:14:03,607] [INFO] [timer.py:264:stop] epoch=0/micro_step=31000/global_step=31000, RunningAvgSamplesPerSec=43.84504611231368, CurrSamplesPerSec=45.701154649790695, MemAllocated=2.04GB, MaxMemAllocated=15.54GB [cnode7-012:0/16] 2024-12-07 04:14:03,609 (deepspeed_trainer:228) INFO: 20epoch:train:901-1000batch: iter_time=1.064e-04, loss_ctc=71.698, loss_att=59.756, acc=0.699, loss=63.339, grad_norm=4.733, loss_scale=1.000, learning_rate=9.161e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 04:14:41,473 (deepspeed_trainer:228) INFO: 20epoch:train:1001-1100batch: iter_time=1.101e-04, loss_ctc=76.974, loss_att=57.015, acc=0.691, loss=63.003, grad_norm=5.029, loss_scale=1.000, learning_rate=9.160e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 04:15:19,366 (deepspeed_trainer:228) INFO: 20epoch:train:1101-1200batch: iter_time=1.110e-04, loss_ctc=77.356, loss_att=64.072, acc=0.688, loss=68.057, grad_norm=4.455, loss_scale=1.000, learning_rate=9.158e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 04:15:56,651 (deepspeed_trainer:228) INFO: 20epoch:train:1201-1300batch: iter_time=1.070e-04, loss_ctc=72.142, loss_att=57.984, acc=0.693, loss=62.239, grad_norm=4.127, loss_scale=1.000, learning_rate=9.157e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 04:16:33,743 (deepspeed_trainer:228) INFO: 20epoch:train:1301-1400batch: iter_time=1.127e-04, loss_ctc=67.961, loss_att=51.609, acc=0.705, loss=56.503, grad_norm=4.350, loss_scale=1.000, learning_rate=9.155e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 04:17:11,003 (deepspeed_trainer:228) INFO: 20epoch:train:1401-1500batch: iter_time=1.050e-04, loss_ctc=86.538, loss_att=61.182, acc=0.700, loss=68.784, grad_norm=5.255, loss_scale=1.000, learning_rate=9.153e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 04:17:49,043 (deepspeed_trainer:228) INFO: 20epoch:train:1501-1600batch: iter_time=1.118e-04, loss_ctc=80.513, loss_att=62.887, acc=0.678, loss=68.157, grad_norm=5.744, loss_scale=1.000, learning_rate=9.152e-05, step_time=0.380 [cnode7-012:0/16] 2024-12-07 04:18:26,079 (deepspeed_trainer:228) INFO: 20epoch:train:1601-1700batch: iter_time=1.064e-04, loss_ctc=70.293, loss_att=54.848, acc=0.692, loss=59.512, grad_norm=4.319, loss_scale=1.000, learning_rate=9.150e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 04:19:04,401 (deepspeed_trainer:228) INFO: 20epoch:train:1701-1800batch: iter_time=1.076e-04, loss_ctc=78.962, loss_att=64.931, acc=0.695, loss=69.110, grad_norm=5.153, loss_scale=1.000, learning_rate=9.149e-05, step_time=0.382 [cnode7-012:0/16] 2024-12-07 04:19:37,313 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-07 04:20:04,244 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 04:20:21,232 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 04:20:21,232 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 04:20:21,235 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 04:20:44,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:20:45,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:20:45,771] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:20:45,815] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:20:46,424] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:20:46,606] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:20:47,063] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:20:47,234] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:20:52,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:20:52,152] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:20:52,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:20:52,395] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:20:52,485] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:20:52,571] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:20:52,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:20:52,662] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:33,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:33,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:34,986] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:35,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:35,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:35,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:36,888] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:38,486] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:41,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:42,266] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:43,513] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:44,183] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:45,166] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:45,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:45,334] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:21:45,580] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:22,333] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:22,469] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:22,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:23,383] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:23,929] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:26,721] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:31,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:29,395] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:32,766] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:33,260] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:34,015] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:31,448] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:34,661] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:35,063] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:35,298] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:22:38,043] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:09,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:10,891] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:10,946] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:11,899] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:11,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:18,517] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:16,563] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:18,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:22,507] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:22,611] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:22,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:23,432] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:23,539] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:21,420] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:26,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:23:27,624] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 04:24:04,556 (deepspeed_trainer:228) INFO: 20epoch:train:1801-1900batch: iter_time=2.582, loss_ctc=70.076, loss_att=61.336, acc=0.687, loss=63.944, grad_norm=4.220, loss_scale=1.000, learning_rate=9.147e-05, step_time=0.420 [2024-12-07 04:24:42,480] [INFO] [logging.py:129:log_dist] [Rank 0] step=287000, skipped=0, lr=[np.float64(9.144583205868502e-05)], mom=[[0.9, 0.98]] [2024-12-07 04:24:42,483] [INFO] [timer.py:264:stop] epoch=0/micro_step=32000/global_step=32000, RunningAvgSamplesPerSec=43.82030857035735, CurrSamplesPerSec=43.989653276589856, MemAllocated=2.04GB, MaxMemAllocated=15.54GB [cnode7-012:0/16] 2024-12-07 04:24:42,485 (deepspeed_trainer:228) INFO: 20epoch:train:1901-2000batch: iter_time=1.073e-04, loss_ctc=81.551, loss_att=56.275, acc=0.704, loss=63.913, grad_norm=5.048, loss_scale=1.000, learning_rate=9.145e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 04:25:19,911 (deepspeed_trainer:228) INFO: 20epoch:train:2001-2100batch: iter_time=1.060e-04, loss_ctc=78.306, loss_att=63.053, acc=0.694, loss=67.657, grad_norm=4.504, loss_scale=1.000, learning_rate=9.144e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 04:25:57,015 (deepspeed_trainer:228) INFO: 20epoch:train:2101-2200batch: iter_time=1.069e-04, loss_ctc=69.532, loss_att=53.952, acc=0.712, loss=58.622, grad_norm=4.277, loss_scale=1.000, learning_rate=9.142e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 04:26:33,983 (deepspeed_trainer:228) INFO: 20epoch:train:2201-2300batch: iter_time=1.069e-04, loss_ctc=72.949, loss_att=53.281, acc=0.689, loss=59.195, grad_norm=5.436, loss_scale=1.000, learning_rate=9.141e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 04:27:11,215 (deepspeed_trainer:228) INFO: 20epoch:train:2301-2400batch: iter_time=1.079e-04, loss_ctc=70.091, loss_att=50.819, acc=0.714, loss=56.599, grad_norm=4.584, loss_scale=1.000, learning_rate=9.139e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 04:27:48,178 (deepspeed_trainer:228) INFO: 20epoch:train:2401-2500batch: iter_time=1.064e-04, loss_ctc=78.226, loss_att=60.376, acc=0.700, loss=65.705, grad_norm=4.649, loss_scale=1.000, learning_rate=9.137e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 04:28:25,076 (deepspeed_trainer:228) INFO: 20epoch:train:2501-2600batch: iter_time=1.089e-04, loss_ctc=84.139, loss_att=64.988, acc=0.696, loss=70.740, grad_norm=5.243, loss_scale=1.000, learning_rate=9.136e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 04:29:02,034 (deepspeed_trainer:228) INFO: 20epoch:train:2601-2700batch: iter_time=1.058e-04, loss_ctc=81.810, loss_att=62.600, acc=0.690, loss=68.377, grad_norm=4.729, loss_scale=1.000, learning_rate=9.134e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 04:29:39,110 (deepspeed_trainer:228) INFO: 20epoch:train:2701-2800batch: iter_time=1.072e-04, loss_ctc=69.263, loss_att=53.006, acc=0.702, loss=57.888, grad_norm=4.364, loss_scale=1.000, learning_rate=9.133e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 04:30:16,374 (deepspeed_trainer:228) INFO: 20epoch:train:2801-2900batch: iter_time=1.059e-04, loss_ctc=76.872, loss_att=61.050, acc=0.696, loss=65.811, grad_norm=4.662, loss_scale=1.000, learning_rate=9.131e-05, step_time=0.372 [2024-12-07 04:30:53,226] [INFO] [logging.py:129:log_dist] [Rank 0] step=288000, skipped=0, lr=[np.float64(9.128693443340409e-05)], mom=[[0.9, 0.98]] [2024-12-07 04:30:53,226] [INFO] [timer.py:264:stop] epoch=0/micro_step=33000/global_step=33000, RunningAvgSamplesPerSec=43.8339337528805, CurrSamplesPerSec=43.95648911881708, MemAllocated=2.04GB, MaxMemAllocated=15.54GB [cnode7-012:0/16] 2024-12-07 04:30:53,228 (deepspeed_trainer:228) INFO: 20epoch:train:2901-3000batch: iter_time=1.057e-04, loss_ctc=74.045, loss_att=55.387, acc=0.697, loss=60.956, grad_norm=5.381, loss_scale=1.000, learning_rate=9.129e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 04:31:30,336 (deepspeed_trainer:228) INFO: 20epoch:train:3001-3100batch: iter_time=1.066e-04, loss_ctc=77.089, loss_att=62.631, acc=0.680, loss=66.968, grad_norm=4.532, loss_scale=1.000, learning_rate=9.128e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 04:32:07,510 (deepspeed_trainer:228) INFO: 20epoch:train:3101-3200batch: iter_time=1.047e-04, loss_ctc=75.725, loss_att=57.388, acc=0.686, loss=62.890, grad_norm=4.717, loss_scale=1.000, learning_rate=9.126e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 04:32:44,455 (deepspeed_trainer:228) INFO: 20epoch:train:3201-3300batch: iter_time=1.063e-04, loss_ctc=64.438, loss_att=49.022, acc=0.712, loss=53.666, grad_norm=3.946, loss_scale=1.000, learning_rate=9.125e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 04:33:21,328 (deepspeed_trainer:228) INFO: 20epoch:train:3301-3400batch: iter_time=1.070e-04, loss_ctc=85.315, loss_att=59.428, acc=0.693, loss=67.212, grad_norm=4.793, loss_scale=1.000, learning_rate=9.123e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 04:33:58,452 (deepspeed_trainer:228) INFO: 20epoch:train:3401-3500batch: iter_time=1.062e-04, loss_ctc=76.086, loss_att=63.592, acc=0.677, loss=67.318, grad_norm=5.485, loss_scale=1.000, learning_rate=9.122e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 04:34:35,191 (deepspeed_trainer:228) INFO: 20epoch:train:3501-3600batch: iter_time=1.066e-04, loss_ctc=71.932, loss_att=53.981, acc=0.691, loss=59.378, grad_norm=4.477, loss_scale=1.000, learning_rate=9.120e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 04:35:12,453 (deepspeed_trainer:228) INFO: 20epoch:train:3601-3700batch: iter_time=1.071e-04, loss_ctc=79.876, loss_att=68.531, acc=0.680, loss=71.949, grad_norm=4.805, loss_scale=1.000, learning_rate=9.118e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 04:35:34,833 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-07 04:36:02,116 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 04:36:18,700 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 04:36:18,700 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 04:36:18,702 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 04:36:42,717] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:36:42,932] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:36:43,426] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:36:40,661] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:36:40,958] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:36:40,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:36:41,652] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:36:42,091] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:36:42,591] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:36:42,727] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:36:43,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:36:46,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:36:47,309] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:36:48,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:36:48,276] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:36:48,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:30,069] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:30,528] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:30,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:28,568] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:29,558] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:30,067] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:30,235] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:31,120] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:31,299] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:31,307] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:31,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:35,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:36,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:37,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:37,590] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:37:37,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:17,361] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:17,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:15,711] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:20,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:18,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:18,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:18,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:19,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:19,976] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:20,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:20,367] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:24,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:24,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:25,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:26,435] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:38:26,584] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:04,167] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:04,835] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:03,339] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:05,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:05,652] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:06,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:06,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:07,068] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:10,381] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:08,132] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:08,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:12,422] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:12,794] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:14,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:14,717] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:39:14,955] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 04:40:03,559 (deepspeed_trainer:228) INFO: 20epoch:train:3701-3800batch: iter_time=2.540, loss_ctc=70.693, loss_att=56.585, acc=0.693, loss=60.824, grad_norm=4.325, loss_scale=1.000, learning_rate=9.117e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 04:40:40,715 (deepspeed_trainer:228) INFO: 20epoch:train:3801-3900batch: iter_time=1.042e-04, loss_ctc=80.176, loss_att=53.501, acc=0.709, loss=61.539, grad_norm=5.012, loss_scale=1.000, learning_rate=9.115e-05, step_time=0.371 [2024-12-07 04:41:17,922] [INFO] [logging.py:129:log_dist] [Rank 0] step=289000, skipped=0, lr=[np.float64(9.11288622485077e-05)], mom=[[0.9, 0.98]] [2024-12-07 04:41:17,922] [INFO] [timer.py:264:stop] epoch=0/micro_step=34000/global_step=34000, RunningAvgSamplesPerSec=43.84708523327153, CurrSamplesPerSec=43.503674665961796, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 04:41:17,924 (deepspeed_trainer:228) INFO: 20epoch:train:3901-4000batch: iter_time=1.076e-04, loss_ctc=73.677, loss_att=59.672, acc=0.695, loss=63.873, grad_norm=4.630, loss_scale=1.000, learning_rate=9.114e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 04:41:55,146 (deepspeed_trainer:228) INFO: 20epoch:train:4001-4100batch: iter_time=1.071e-04, loss_ctc=73.823, loss_att=57.279, acc=0.713, loss=62.252, grad_norm=4.271, loss_scale=1.000, learning_rate=9.112e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 04:42:32,071 (deepspeed_trainer:228) INFO: 20epoch:train:4101-4200batch: iter_time=1.057e-04, loss_ctc=68.918, loss_att=48.481, acc=0.707, loss=54.603, grad_norm=4.766, loss_scale=1.000, learning_rate=9.111e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 04:43:09,120 (deepspeed_trainer:228) INFO: 20epoch:train:4201-4300batch: iter_time=1.054e-04, loss_ctc=70.892, loss_att=53.836, acc=0.708, loss=58.944, grad_norm=4.411, loss_scale=1.000, learning_rate=9.109e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 04:43:46,178 (deepspeed_trainer:228) INFO: 20epoch:train:4301-4400batch: iter_time=1.053e-04, loss_ctc=72.082, loss_att=52.719, acc=0.710, loss=58.536, grad_norm=4.559, loss_scale=1.000, learning_rate=9.107e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 04:44:23,527 (deepspeed_trainer:228) INFO: 20epoch:train:4401-4500batch: iter_time=1.057e-04, loss_ctc=86.965, loss_att=66.928, acc=0.688, loss=72.924, grad_norm=5.201, loss_scale=1.000, learning_rate=9.106e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 04:45:00,902 (deepspeed_trainer:228) INFO: 20epoch:train:4501-4600batch: iter_time=1.061e-04, loss_ctc=85.073, loss_att=65.665, acc=0.691, loss=71.490, grad_norm=5.107, loss_scale=1.000, learning_rate=9.104e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 04:45:37,484 (deepspeed_trainer:228) INFO: 20epoch:train:4601-4700batch: iter_time=1.058e-04, loss_ctc=67.074, loss_att=50.239, acc=0.702, loss=55.297, grad_norm=4.680, loss_scale=1.000, learning_rate=9.103e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 04:46:14,397 (deepspeed_trainer:228) INFO: 20epoch:train:4701-4800batch: iter_time=1.078e-04, loss_ctc=77.380, loss_att=62.149, acc=0.696, loss=66.720, grad_norm=4.977, loss_scale=1.000, learning_rate=9.101e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 04:46:50,882 (deepspeed_trainer:228) INFO: 20epoch:train:4801-4900batch: iter_time=1.057e-04, loss_ctc=72.373, loss_att=54.239, acc=0.693, loss=59.682, grad_norm=4.461, loss_scale=1.000, learning_rate=9.100e-05, step_time=0.364 [2024-12-07 04:47:27,735] [INFO] [logging.py:129:log_dist] [Rank 0] step=290000, skipped=0, lr=[np.float64(9.097160838200297e-05)], mom=[[0.9, 0.98]] [2024-12-07 04:47:27,736] [INFO] [timer.py:264:stop] epoch=0/micro_step=35000/global_step=35000, RunningAvgSamplesPerSec=43.862444479833506, CurrSamplesPerSec=44.970505159232864, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 04:47:27,738 (deepspeed_trainer:228) INFO: 20epoch:train:4901-5000batch: iter_time=1.071e-04, loss_ctc=73.769, loss_att=58.680, acc=0.691, loss=63.231, grad_norm=4.479, loss_scale=1.000, learning_rate=9.098e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 04:48:04,451 (deepspeed_trainer:228) INFO: 20epoch:train:5001-5100batch: iter_time=1.059e-04, loss_ctc=71.393, loss_att=54.165, acc=0.686, loss=59.367, grad_norm=4.889, loss_scale=1.000, learning_rate=9.096e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 04:48:41,336 (deepspeed_trainer:228) INFO: 20epoch:train:5101-5200batch: iter_time=1.080e-04, loss_ctc=75.924, loss_att=55.992, acc=0.715, loss=61.971, grad_norm=4.268, loss_scale=1.000, learning_rate=9.095e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 04:49:17,963 (deepspeed_trainer:228) INFO: 20epoch:train:5201-5300batch: iter_time=1.084e-04, loss_ctc=80.232, loss_att=57.539, acc=0.685, loss=64.347, grad_norm=4.758, loss_scale=1.000, learning_rate=9.093e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 04:49:54,937 (deepspeed_trainer:228) INFO: 20epoch:train:5301-5400batch: iter_time=1.105e-04, loss_ctc=73.618, loss_att=61.194, acc=0.686, loss=64.963, grad_norm=4.833, loss_scale=1.000, learning_rate=9.092e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 04:50:31,777 (deepspeed_trainer:228) INFO: 20epoch:train:5401-5500batch: iter_time=1.059e-04, loss_ctc=72.778, loss_att=54.608, acc=0.692, loss=60.047, grad_norm=4.521, loss_scale=1.000, learning_rate=9.090e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 04:51:08,757 (deepspeed_trainer:228) INFO: 20epoch:train:5501-5600batch: iter_time=1.055e-04, loss_ctc=77.440, loss_att=67.669, acc=0.684, loss=70.588, grad_norm=4.833, loss_scale=1.000, learning_rate=9.089e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 04:51:21,679 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-07 04:51:48,495 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 04:52:04,980 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 04:52:04,980 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 04:52:04,983 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 04:52:30,855] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:52:31,095] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:52:32,146] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:52:29,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:52:33,118] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:52:30,112] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:52:30,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:52:33,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:52:30,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:52:30,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:52:30,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:52:30,645] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:52:30,722] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:52:33,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:52:33,919] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:52:34,233] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:19,023] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:19,492] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:17,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:19,351] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:19,443] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:22,684] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:23,006] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:19,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:20,158] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:23,548] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:20,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:20,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:20,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:23,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:25,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:53:25,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:05,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:08,453] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:08,604] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:08,204] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:08,468] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:08,877] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:09,118] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:09,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:09,772] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:12,827] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:10,038] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:13,329] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:14,309] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:16,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:17,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:17,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:55,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:52,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:56,534] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:56,221] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:57,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:57,843] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:57,899] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:58,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:58,716] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:54:58,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:55:02,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:55:03,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:55:03,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:55:07,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:55:11,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 04:55:11,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 04:56:08,947 (deepspeed_trainer:228) INFO: 20epoch:train:5601-5700batch: iter_time=2.626, loss_ctc=69.083, loss_att=53.039, acc=0.703, loss=57.871, grad_norm=4.150, loss_scale=1.000, learning_rate=9.087e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 04:56:47,324 (deepspeed_trainer:228) INFO: 20epoch:train:5701-5800batch: iter_time=1.237e-04, loss_ctc=87.809, loss_att=65.430, acc=0.705, loss=72.176, grad_norm=4.876, loss_scale=1.000, learning_rate=9.085e-05, step_time=0.383 [cnode7-012:0/16] 2024-12-07 04:57:24,385 (deepspeed_trainer:228) INFO: 20epoch:train:5801-5900batch: iter_time=1.117e-04, loss_ctc=69.402, loss_att=53.751, acc=0.710, loss=58.466, grad_norm=4.414, loss_scale=1.000, learning_rate=9.084e-05, step_time=0.370 [2024-12-07 04:58:02,193] [INFO] [logging.py:129:log_dist] [Rank 0] step=291000, skipped=0, lr=[np.float64(9.081516579763024e-05)], mom=[[0.9, 0.98]] [2024-12-07 04:58:02,194] [INFO] [timer.py:264:stop] epoch=0/micro_step=36000/global_step=36000, RunningAvgSamplesPerSec=43.870530198628146, CurrSamplesPerSec=44.594068333016025, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 04:58:02,195 (deepspeed_trainer:228) INFO: 20epoch:train:5901-6000batch: iter_time=1.180e-04, loss_ctc=66.480, loss_att=51.888, acc=0.720, loss=56.266, grad_norm=4.154, loss_scale=1.000, learning_rate=9.082e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 04:58:39,975 (deepspeed_trainer:228) INFO: 20epoch:train:6001-6100batch: iter_time=1.128e-04, loss_ctc=70.268, loss_att=52.428, acc=0.706, loss=57.768, grad_norm=4.856, loss_scale=1.000, learning_rate=9.081e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 04:59:17,140 (deepspeed_trainer:228) INFO: 20epoch:train:6101-6200batch: iter_time=1.118e-04, loss_ctc=75.286, loss_att=59.805, acc=0.702, loss=64.421, grad_norm=4.425, loss_scale=1.000, learning_rate=9.079e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 04:59:54,209 (deepspeed_trainer:228) INFO: 20epoch:train:6201-6300batch: iter_time=1.121e-04, loss_ctc=72.556, loss_att=52.424, acc=0.716, loss=58.476, grad_norm=4.626, loss_scale=1.000, learning_rate=9.078e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 05:00:31,723 (deepspeed_trainer:228) INFO: 20epoch:train:6301-6400batch: iter_time=1.141e-04, loss_ctc=87.653, loss_att=67.460, acc=0.695, loss=73.522, grad_norm=5.435, loss_scale=1.000, learning_rate=9.076e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 05:01:09,349 (deepspeed_trainer:228) INFO: 20epoch:train:6401-6500batch: iter_time=1.137e-04, loss_ctc=82.871, loss_att=63.755, acc=0.710, loss=69.491, grad_norm=4.998, loss_scale=1.000, learning_rate=9.074e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 05:01:46,108 (deepspeed_trainer:228) INFO: 20epoch:train:6501-6600batch: iter_time=1.092e-04, loss_ctc=63.698, loss_att=51.693, acc=0.697, loss=55.304, grad_norm=4.446, loss_scale=1.000, learning_rate=9.073e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 05:02:23,402 (deepspeed_trainer:228) INFO: 20epoch:train:6601-6700batch: iter_time=1.105e-04, loss_ctc=72.603, loss_att=59.369, acc=0.701, loss=63.362, grad_norm=5.121, loss_scale=1.000, learning_rate=9.071e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 05:03:00,679 (deepspeed_trainer:228) INFO: 20epoch:train:6701-6800batch: iter_time=1.112e-04, loss_ctc=77.691, loss_att=63.919, acc=0.689, loss=68.062, grad_norm=4.555, loss_scale=1.000, learning_rate=9.070e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 05:03:37,915 (deepspeed_trainer:228) INFO: 20epoch:train:6801-6900batch: iter_time=1.113e-04, loss_ctc=74.289, loss_att=60.077, acc=0.697, loss=64.310, grad_norm=4.520, loss_scale=1.000, learning_rate=9.068e-05, step_time=0.372 [2024-12-07 05:04:14,950] [INFO] [logging.py:129:log_dist] [Rank 0] step=292000, skipped=0, lr=[np.float64(9.06595275435408e-05)], mom=[[0.9, 0.98]] [2024-12-07 05:04:14,951] [INFO] [timer.py:264:stop] epoch=0/micro_step=37000/global_step=37000, RunningAvgSamplesPerSec=43.87543804392528, CurrSamplesPerSec=46.158360082101524, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 05:04:14,953 (deepspeed_trainer:228) INFO: 20epoch:train:6901-7000batch: iter_time=1.148e-04, loss_ctc=65.982, loss_att=48.398, acc=0.706, loss=53.650, grad_norm=4.306, loss_scale=1.000, learning_rate=9.067e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 05:04:52,196 (deepspeed_trainer:228) INFO: 20epoch:train:7001-7100batch: iter_time=1.109e-04, loss_ctc=80.228, loss_att=59.630, acc=0.712, loss=65.787, grad_norm=4.615, loss_scale=1.000, learning_rate=9.065e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 05:05:29,213 (deepspeed_trainer:228) INFO: 20epoch:train:7101-7200batch: iter_time=1.122e-04, loss_ctc=81.716, loss_att=59.830, acc=0.686, loss=66.386, grad_norm=5.448, loss_scale=1.000, learning_rate=9.064e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 05:06:06,309 (deepspeed_trainer:228) INFO: 20epoch:train:7201-7300batch: iter_time=1.107e-04, loss_ctc=70.696, loss_att=58.826, acc=0.694, loss=62.427, grad_norm=4.613, loss_scale=1.000, learning_rate=9.062e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 05:06:43,553 (deepspeed_trainer:228) INFO: 20epoch:train:7301-7400batch: iter_time=1.115e-04, loss_ctc=76.812, loss_att=62.987, acc=0.693, loss=67.160, grad_norm=4.782, loss_scale=1.000, learning_rate=9.061e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 05:07:20,947 (deepspeed_trainer:228) INFO: 20epoch:train:7401-7500batch: iter_time=1.114e-04, loss_ctc=72.518, loss_att=63.330, acc=0.688, loss=66.075, grad_norm=4.430, loss_scale=1.000, learning_rate=9.059e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 05:07:24,917 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-07 05:07:52,194 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 05:08:09,128 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 05:08:09,128 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 05:08:09,130 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 05:08:32,853] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:08:33,463] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:08:30,401] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:08:33,779] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:08:31,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:08:31,948] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:08:35,402] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:08:35,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:08:32,771] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:08:33,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:08:33,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:08:33,163] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:08:33,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:08:36,538] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:08:37,484] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:08:38,037] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:20,924] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:21,807] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:22,262] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:19,212] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:19,531] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:20,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:20,896] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:21,339] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:21,454] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:25,111] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:22,111] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:22,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:25,533] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:25,771] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:26,513] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:09:30,801] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:09,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:06,394] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:10,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:10,904] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:08,576] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:08,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:08,983] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:09,195] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:09,667] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:10,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:15,434] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:15,474] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:15,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:16,508] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:13,891] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:22,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:56,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:53,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:56,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:58,629] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:56,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:57,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:57,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:57,521] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:10:57,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:11:00,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:11:04,250] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:11:04,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:11:04,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:11:03,227] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:11:07,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:11:15,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 05:12:20,166 (deepspeed_trainer:228) INFO: 20epoch:train:7501-7600batch: iter_time=2.563, loss_ctc=76.552, loss_att=54.038, acc=0.712, loss=60.800, grad_norm=4.507, loss_scale=1.000, learning_rate=9.057e-05, step_time=0.429 [cnode7-012:0/16] 2024-12-07 05:12:57,145 (deepspeed_trainer:228) INFO: 20epoch:train:7601-7700batch: iter_time=1.048e-04, loss_ctc=80.113, loss_att=62.561, acc=0.705, loss=67.825, grad_norm=4.731, loss_scale=1.000, learning_rate=9.056e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 05:13:34,567 (deepspeed_trainer:228) INFO: 20epoch:train:7701-7800batch: iter_time=1.097e-04, loss_ctc=70.553, loss_att=53.383, acc=0.722, loss=58.523, grad_norm=4.691, loss_scale=1.000, learning_rate=9.054e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 05:14:11,908 (deepspeed_trainer:228) INFO: 20epoch:train:7801-7900batch: iter_time=1.086e-04, loss_ctc=61.543, loss_att=47.000, acc=0.708, loss=51.331, grad_norm=4.468, loss_scale=1.000, learning_rate=9.053e-05, step_time=0.373 [2024-12-07 05:14:48,952] [INFO] [logging.py:129:log_dist] [Rank 0] step=293000, skipped=0, lr=[np.float64(9.050468675099929e-05)], mom=[[0.9, 0.98]] [2024-12-07 05:14:48,953] [INFO] [timer.py:264:stop] epoch=0/micro_step=38000/global_step=38000, RunningAvgSamplesPerSec=43.863438484995406, CurrSamplesPerSec=45.166542418278624, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 05:14:48,954 (deepspeed_trainer:228) INFO: 20epoch:train:7901-8000batch: iter_time=1.074e-04, loss_ctc=74.243, loss_att=53.850, acc=0.709, loss=60.011, grad_norm=4.486, loss_scale=1.000, learning_rate=9.051e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 05:15:26,200 (deepspeed_trainer:228) INFO: 20epoch:train:8001-8100batch: iter_time=1.095e-04, loss_ctc=78.071, loss_att=61.338, acc=0.705, loss=66.355, grad_norm=4.667, loss_scale=1.000, learning_rate=9.050e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 05:16:03,309 (deepspeed_trainer:228) INFO: 20epoch:train:8101-8200batch: iter_time=1.080e-04, loss_ctc=79.526, loss_att=57.930, acc=0.710, loss=64.412, grad_norm=5.180, loss_scale=1.000, learning_rate=9.048e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 05:16:40,653 (deepspeed_trainer:228) INFO: 20epoch:train:8201-8300batch: iter_time=1.083e-04, loss_ctc=84.523, loss_att=65.157, acc=0.704, loss=70.964, grad_norm=5.204, loss_scale=1.000, learning_rate=9.047e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 05:17:17,733 (deepspeed_trainer:228) INFO: 20epoch:train:8301-8400batch: iter_time=1.083e-04, loss_ctc=73.025, loss_att=58.377, acc=0.701, loss=62.784, grad_norm=4.603, loss_scale=1.000, learning_rate=9.045e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 05:17:54,891 (deepspeed_trainer:228) INFO: 20epoch:train:8401-8500batch: iter_time=1.090e-04, loss_ctc=70.162, loss_att=59.230, acc=0.705, loss=62.503, grad_norm=4.108, loss_scale=1.000, learning_rate=9.044e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 05:18:31,905 (deepspeed_trainer:228) INFO: 20epoch:train:8501-8600batch: iter_time=1.111e-04, loss_ctc=75.238, loss_att=56.097, acc=0.697, loss=61.847, grad_norm=4.964, loss_scale=1.000, learning_rate=9.042e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 05:19:09,639 (deepspeed_trainer:228) INFO: 20epoch:train:8601-8700batch: iter_time=1.073e-04, loss_ctc=76.548, loss_att=63.614, acc=0.693, loss=67.487, grad_norm=4.908, loss_scale=1.000, learning_rate=9.040e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 05:19:46,981 (deepspeed_trainer:228) INFO: 20epoch:train:8701-8800batch: iter_time=1.064e-04, loss_ctc=71.176, loss_att=57.165, acc=0.697, loss=61.376, grad_norm=4.450, loss_scale=1.000, learning_rate=9.039e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 05:20:23,840 (deepspeed_trainer:228) INFO: 20epoch:train:8801-8900batch: iter_time=1.033e-04, loss_ctc=67.271, loss_att=51.244, acc=0.707, loss=56.038, grad_norm=4.183, loss_scale=1.000, learning_rate=9.037e-05, step_time=0.369 [2024-12-07 05:21:00,741] [INFO] [logging.py:129:log_dist] [Rank 0] step=294000, skipped=0, lr=[np.float64(9.035063663311049e-05)], mom=[[0.9, 0.98]] [2024-12-07 05:21:00,742] [INFO] [timer.py:264:stop] epoch=0/micro_step=39000/global_step=39000, RunningAvgSamplesPerSec=43.870381270071604, CurrSamplesPerSec=46.15493151739742, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 05:21:00,743 (deepspeed_trainer:228) INFO: 20epoch:train:8901-9000batch: iter_time=1.074e-04, loss_ctc=83.765, loss_att=59.967, acc=0.706, loss=67.100, grad_norm=5.030, loss_scale=1.000, learning_rate=9.036e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 05:21:37,857 (deepspeed_trainer:228) INFO: 20epoch:train:9001-9100batch: iter_time=1.072e-04, loss_ctc=77.405, loss_att=60.794, acc=0.687, loss=65.803, grad_norm=5.060, loss_scale=1.000, learning_rate=9.034e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 05:22:14,426 (deepspeed_trainer:228) INFO: 20epoch:train:9101-9200batch: iter_time=1.120e-04, loss_ctc=68.956, loss_att=53.494, acc=0.699, loss=58.132, grad_norm=4.303, loss_scale=1.000, learning_rate=9.033e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 05:22:51,221 (deepspeed_trainer:228) INFO: 20epoch:train:9201-9300batch: iter_time=1.156e-04, loss_ctc=76.998, loss_att=63.142, acc=0.701, loss=67.343, grad_norm=4.483, loss_scale=1.000, learning_rate=9.031e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 05:23:22,763 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-07 05:23:50,791 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 05:24:07,632 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 05:24:07,632 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 05:24:07,634 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 05:24:31,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:24:32,855] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:24:30,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:24:33,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:24:31,069] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:24:31,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:24:31,635] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:24:34,481] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:24:32,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:24:32,365] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:24:32,517] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:24:32,873] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:24:35,480] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:24:35,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:24:35,841] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:24:36,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:21,424] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:21,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:21,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:20,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:20,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:20,919] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:21,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:21,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:21,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:21,963] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:24,193] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:24,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:25,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:25,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:25,952] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:25:25,723] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:10,762] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:11,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:09,135] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:11,635] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:09,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:09,944] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:10,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:10,885] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:10,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:10,971] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:14,748] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:15,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:16,246] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:16,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:16,857] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:17,071] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:58,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:58,871] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:59,087] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:57,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:58,110] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:58,190] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:58,472] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:59,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:59,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:26:59,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:27:03,273] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:27:03,630] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:27:03,910] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:27:04,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:27:05,485] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:27:06,580] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 05:27:43,810 (deepspeed_trainer:228) INFO: 20epoch:train:9301-9400batch: iter_time=2.527, loss_ctc=69.135, loss_att=60.562, acc=0.691, loss=63.141, grad_norm=4.357, loss_scale=1.000, learning_rate=9.030e-05, step_time=0.398 [cnode7-012:0/16] 2024-12-07 05:28:21,921 (deepspeed_trainer:228) INFO: 20epoch:train:9401-9500batch: iter_time=1.206e-04, loss_ctc=80.660, loss_att=55.470, acc=0.709, loss=63.003, grad_norm=5.135, loss_scale=1.000, learning_rate=9.028e-05, step_time=0.381 [cnode7-012:0/16] 2024-12-07 05:28:59,374 (deepspeed_trainer:228) INFO: 20epoch:train:9501-9600batch: iter_time=1.110e-04, loss_ctc=76.873, loss_att=61.997, acc=0.697, loss=66.462, grad_norm=4.470, loss_scale=1.000, learning_rate=9.027e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 05:29:36,541 (deepspeed_trainer:228) INFO: 20epoch:train:9601-9700batch: iter_time=1.135e-04, loss_ctc=68.417, loss_att=53.003, acc=0.716, loss=57.626, grad_norm=4.103, loss_scale=1.000, learning_rate=9.025e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 05:30:13,365 (deepspeed_trainer:228) INFO: 20epoch:train:9701-9800batch: iter_time=1.122e-04, loss_ctc=69.983, loss_att=52.102, acc=0.695, loss=57.474, grad_norm=4.807, loss_scale=1.000, learning_rate=9.024e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 05:30:50,691 (deepspeed_trainer:228) INFO: 20epoch:train:9801-9900batch: iter_time=1.123e-04, loss_ctc=68.730, loss_att=50.256, acc=0.715, loss=55.796, grad_norm=4.503, loss_scale=1.000, learning_rate=9.022e-05, step_time=0.368 [2024-12-07 05:31:27,679] [INFO] [logging.py:129:log_dist] [Rank 0] step=295000, skipped=0, lr=[np.float64(9.019737048356989e-05)], mom=[[0.9, 0.98]] [2024-12-07 05:31:27,680] [INFO] [timer.py:264:stop] epoch=0/micro_step=40000/global_step=40000, RunningAvgSamplesPerSec=43.86866426291861, CurrSamplesPerSec=45.360586099224975, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 05:31:27,682 (deepspeed_trainer:228) INFO: 20epoch:train:9901-10000batch: iter_time=1.082e-04, loss_ctc=76.301, loss_att=60.005, acc=0.703, loss=64.891, grad_norm=4.490, loss_scale=1.000, learning_rate=9.020e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 05:32:04,961 (deepspeed_trainer:228) INFO: 20epoch:train:10001-10100batch: iter_time=1.059e-04, loss_ctc=82.951, loss_att=64.384, acc=0.697, loss=69.943, grad_norm=5.273, loss_scale=1.000, learning_rate=9.019e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 05:32:41,808 (deepspeed_trainer:228) INFO: 20epoch:train:10101-10200batch: iter_time=1.038e-04, loss_ctc=80.281, loss_att=61.057, acc=0.697, loss=66.829, grad_norm=4.534, loss_scale=1.000, learning_rate=9.017e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 05:33:18,410 (deepspeed_trainer:228) INFO: 20epoch:train:10201-10300batch: iter_time=1.054e-04, loss_ctc=68.517, loss_att=52.215, acc=0.708, loss=57.087, grad_norm=4.373, loss_scale=1.000, learning_rate=9.016e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 05:33:55,607 (deepspeed_trainer:228) INFO: 20epoch:train:10301-10400batch: iter_time=1.052e-04, loss_ctc=75.188, loss_att=59.688, acc=0.701, loss=64.362, grad_norm=4.532, loss_scale=1.000, learning_rate=9.014e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 05:34:32,376 (deepspeed_trainer:228) INFO: 20epoch:train:10401-10500batch: iter_time=1.059e-04, loss_ctc=72.296, loss_att=54.144, acc=0.702, loss=59.590, grad_norm=4.641, loss_scale=1.000, learning_rate=9.013e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 05:35:09,356 (deepspeed_trainer:228) INFO: 20epoch:train:10501-10600batch: iter_time=1.059e-04, loss_ctc=75.764, loss_att=61.090, acc=0.686, loss=65.483, grad_norm=4.494, loss_scale=1.000, learning_rate=9.011e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 05:35:46,323 (deepspeed_trainer:228) INFO: 20epoch:train:10601-10700batch: iter_time=1.056e-04, loss_ctc=75.090, loss_att=56.289, acc=0.691, loss=61.946, grad_norm=4.829, loss_scale=1.000, learning_rate=9.010e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 05:36:23,239 (deepspeed_trainer:228) INFO: 20epoch:train:10701-10800batch: iter_time=1.082e-04, loss_ctc=63.881, loss_att=47.932, acc=0.717, loss=52.695, grad_norm=4.058, loss_scale=1.000, learning_rate=9.008e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 05:37:00,731 (deepspeed_trainer:228) INFO: 20epoch:train:10801-10900batch: iter_time=1.059e-04, loss_ctc=83.446, loss_att=58.103, acc=0.697, loss=65.737, grad_norm=4.941, loss_scale=1.000, learning_rate=9.007e-05, step_time=0.375 [2024-12-07 05:37:37,854] [INFO] [logging.py:129:log_dist] [Rank 0] step=296000, skipped=0, lr=[np.float64(9.004488167543743e-05)], mom=[[0.9, 0.98]] [2024-12-07 05:37:37,855] [INFO] [timer.py:264:stop] epoch=0/micro_step=41000/global_step=41000, RunningAvgSamplesPerSec=43.879715289406896, CurrSamplesPerSec=43.835924700017955, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 05:37:37,856 (deepspeed_trainer:228) INFO: 20epoch:train:10901-11000batch: iter_time=1.043e-04, loss_ctc=74.467, loss_att=62.818, acc=0.681, loss=66.310, grad_norm=5.273, loss_scale=1.000, learning_rate=9.005e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 05:38:14,810 (deepspeed_trainer:228) INFO: 20epoch:train:11001-11100batch: iter_time=1.047e-04, loss_ctc=71.145, loss_att=53.645, acc=0.694, loss=58.903, grad_norm=4.204, loss_scale=1.000, learning_rate=9.004e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 05:38:52,127 (deepspeed_trainer:228) INFO: 20epoch:train:11101-11200batch: iter_time=1.047e-04, loss_ctc=78.218, loss_att=67.548, acc=0.684, loss=70.757, grad_norm=5.457, loss_scale=1.000, learning_rate=9.002e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 05:39:15,737 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-07 05:39:43,216 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 05:40:00,185 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 05:40:00,185 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 05:40:00,187 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 05:40:23,961] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:40:24,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:40:21,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:40:21,497] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:40:25,067] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:40:22,877] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:40:22,925] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:40:23,043] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:40:23,576] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:40:23,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:40:23,695] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:40:27,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:40:27,915] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:40:28,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:40:28,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:40:28,919] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:09,279] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:10,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:10,916] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:11,372] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:14,419] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:12,155] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:12,280] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:15,078] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:15,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:12,443] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:15,416] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:13,119] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:16,023] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:16,836] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:17,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:24,396] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:01,841] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:59,280] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:59,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:41:59,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:00,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:03,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:01,195] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:01,340] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:01,391] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:05,290] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:05,379] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:05,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:05,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:06,064] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:03,500] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:15,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:46,860] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:47,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:48,157] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:48,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:49,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:52,039] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:49,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:52,852] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:53,299] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:54,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:51,622] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:54,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:53,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:56,696] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:42:56,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:43:07,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 05:43:50,624 (deepspeed_trainer:228) INFO: 20epoch:train:11201-11300batch: iter_time=2.614, loss_ctc=69.775, loss_att=55.594, acc=0.698, loss=59.858, grad_norm=4.628, loss_scale=1.000, learning_rate=9.001e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 05:44:28,104 (deepspeed_trainer:228) INFO: 20epoch:train:11301-11400batch: iter_time=1.133e-04, loss_ctc=79.553, loss_att=52.274, acc=0.715, loss=60.477, grad_norm=4.913, loss_scale=1.000, learning_rate=8.999e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 05:45:05,085 (deepspeed_trainer:228) INFO: 20epoch:train:11401-11500batch: iter_time=1.132e-04, loss_ctc=71.979, loss_att=58.664, acc=0.700, loss=62.678, grad_norm=5.068, loss_scale=1.000, learning_rate=8.998e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 05:45:41,861 (deepspeed_trainer:228) INFO: 20epoch:train:11501-11600batch: iter_time=1.203e-04, loss_ctc=74.191, loss_att=57.307, acc=0.714, loss=62.383, grad_norm=4.254, loss_scale=1.000, learning_rate=8.996e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 05:46:18,499 (deepspeed_trainer:228) INFO: 20epoch:train:11601-11700batch: iter_time=1.230e-04, loss_ctc=66.729, loss_att=47.703, acc=0.709, loss=53.412, grad_norm=4.796, loss_scale=1.000, learning_rate=8.995e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 05:46:55,433 (deepspeed_trainer:228) INFO: 20epoch:train:11701-11800batch: iter_time=1.159e-04, loss_ctc=70.028, loss_att=53.293, acc=0.711, loss=58.292, grad_norm=4.298, loss_scale=1.000, learning_rate=8.993e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 05:47:32,191 (deepspeed_trainer:228) INFO: 20epoch:train:11801-11900batch: iter_time=1.151e-04, loss_ctc=70.448, loss_att=51.495, acc=0.714, loss=57.171, grad_norm=4.420, loss_scale=1.000, learning_rate=8.992e-05, step_time=0.367 [2024-12-07 05:48:09,271] [INFO] [logging.py:129:log_dist] [Rank 0] step=297000, skipped=0, lr=[np.float64(8.989316365993398e-05)], mom=[[0.9, 0.98]] [2024-12-07 05:48:09,272] [INFO] [timer.py:264:stop] epoch=0/micro_step=42000/global_step=42000, RunningAvgSamplesPerSec=43.89047007348258, CurrSamplesPerSec=46.37103332256973, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 05:48:09,274 (deepspeed_trainer:228) INFO: 20epoch:train:11901-12000batch: iter_time=1.150e-04, loss_ctc=84.941, loss_att=66.644, acc=0.692, loss=72.144, grad_norm=6.109, loss_scale=1.000, learning_rate=8.990e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 05:48:46,325 (deepspeed_trainer:228) INFO: 20epoch:train:12001-12100batch: iter_time=1.139e-04, loss_ctc=83.915, loss_att=66.049, acc=0.692, loss=71.402, grad_norm=4.911, loss_scale=1.000, learning_rate=8.989e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 05:49:23,317 (deepspeed_trainer:228) INFO: 20epoch:train:12101-12200batch: iter_time=1.130e-04, loss_ctc=65.681, loss_att=49.017, acc=0.708, loss=54.008, grad_norm=4.555, loss_scale=1.000, learning_rate=8.987e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 05:50:00,637 (deepspeed_trainer:228) INFO: 20epoch:train:12201-12300batch: iter_time=1.120e-04, loss_ctc=75.853, loss_att=61.301, acc=0.701, loss=65.666, grad_norm=4.538, loss_scale=1.000, learning_rate=8.986e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 05:50:37,456 (deepspeed_trainer:228) INFO: 20epoch:train:12301-12400batch: iter_time=1.105e-04, loss_ctc=71.542, loss_att=53.448, acc=0.699, loss=58.872, grad_norm=4.272, loss_scale=1.000, learning_rate=8.984e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 05:51:14,484 (deepspeed_trainer:228) INFO: 20epoch:train:12401-12500batch: iter_time=1.123e-04, loss_ctc=73.284, loss_att=57.542, acc=0.696, loss=62.281, grad_norm=4.332, loss_scale=1.000, learning_rate=8.983e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 05:51:51,283 (deepspeed_trainer:228) INFO: 20epoch:train:12501-12600batch: iter_time=1.157e-04, loss_ctc=70.783, loss_att=52.877, acc=0.693, loss=58.230, grad_norm=4.683, loss_scale=1.000, learning_rate=8.981e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 05:52:28,456 (deepspeed_trainer:228) INFO: 20epoch:train:12601-12700batch: iter_time=1.190e-04, loss_ctc=75.006, loss_att=55.478, acc=0.718, loss=61.320, grad_norm=4.280, loss_scale=1.000, learning_rate=8.979e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 05:53:05,455 (deepspeed_trainer:228) INFO: 20epoch:train:12701-12800batch: iter_time=1.159e-04, loss_ctc=79.469, loss_att=56.909, acc=0.688, loss=63.678, grad_norm=5.199, loss_scale=1.000, learning_rate=8.978e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 05:53:42,407 (deepspeed_trainer:228) INFO: 20epoch:train:12801-12900batch: iter_time=1.170e-04, loss_ctc=71.909, loss_att=59.575, acc=0.689, loss=63.289, grad_norm=4.659, loss_scale=1.000, learning_rate=8.976e-05, step_time=0.369 [2024-12-07 05:54:19,396] [INFO] [logging.py:129:log_dist] [Rank 0] step=298000, skipped=0, lr=[np.float64(8.974220996526009e-05)], mom=[[0.9, 0.98]] [2024-12-07 05:54:19,396] [INFO] [timer.py:264:stop] epoch=0/micro_step=43000/global_step=43000, RunningAvgSamplesPerSec=43.90008200077271, CurrSamplesPerSec=42.54502027965214, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 05:54:19,398 (deepspeed_trainer:228) INFO: 20epoch:train:12901-13000batch: iter_time=1.173e-04, loss_ctc=72.022, loss_att=54.687, acc=0.692, loss=59.918, grad_norm=4.775, loss_scale=1.000, learning_rate=8.975e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 05:54:56,849 (deepspeed_trainer:228) INFO: 20epoch:train:13001-13100batch: iter_time=1.155e-04, loss_ctc=75.902, loss_att=66.223, acc=0.687, loss=69.138, grad_norm=4.589, loss_scale=1.000, learning_rate=8.973e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 05:55:09,785 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-07 05:55:37,092 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 05:55:53,161 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 05:55:53,161 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 05:55:53,164 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 05:56:18,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:56:16,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:56:19,878] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:56:17,397] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:56:17,793] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:56:20,816] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:56:18,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:56:18,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:56:21,689] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:56:18,981] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:56:21,910] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:56:19,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:56:21,995] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:56:22,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:56:19,231] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:56:22,887] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:06,922] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:07,961] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:05,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:08,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:06,460] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:06,694] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:07,029] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:09,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:10,723] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:07,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:08,023] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:11,095] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:08,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:08,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:11,765] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:11,860] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:53,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:55,462] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:53,233] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:56,640] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:54,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:55,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:55,559] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:58,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:56,351] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:59,565] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:59,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:00,541] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:00,846] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:57:59,714] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:00,198] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:00,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:40,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:42,803] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:44,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:41,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:42,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:44,029] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:47,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:44,594] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:48,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:48,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:48,687] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:47,592] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:48,192] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:51,354] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:50,302] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 05:58:50,566] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 05:59:47,440 (deepspeed_trainer:228) INFO: 20epoch:train:13101-13200batch: iter_time=2.510, loss_ctc=67.791, loss_att=51.788, acc=0.709, loss=56.600, grad_norm=4.421, loss_scale=1.000, learning_rate=8.972e-05, step_time=0.396 [cnode7-012:0/16] 2024-12-07 06:00:25,699 (deepspeed_trainer:228) INFO: 20epoch:train:13201-13300batch: iter_time=1.086e-04, loss_ctc=85.837, loss_att=64.538, acc=0.708, loss=70.899, grad_norm=5.212, loss_scale=1.000, learning_rate=8.970e-05, step_time=0.382 [cnode7-012:0/16] 2024-12-07 06:01:03,187 (deepspeed_trainer:228) INFO: 20epoch:train:13301-13400batch: iter_time=1.059e-04, loss_ctc=69.263, loss_att=53.065, acc=0.712, loss=57.933, grad_norm=4.678, loss_scale=1.000, learning_rate=8.969e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 06:01:40,425 (deepspeed_trainer:228) INFO: 20epoch:train:13401-13500batch: iter_time=1.085e-04, loss_ctc=65.862, loss_att=51.710, acc=0.721, loss=55.979, grad_norm=4.055, loss_scale=1.000, learning_rate=8.967e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 06:02:17,493 (deepspeed_trainer:228) INFO: 20epoch:train:13501-13600batch: iter_time=1.064e-04, loss_ctc=69.033, loss_att=52.473, acc=0.707, loss=57.441, grad_norm=4.808, loss_scale=1.000, learning_rate=8.966e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 06:02:54,787 (deepspeed_trainer:228) INFO: 20epoch:train:13601-13700batch: iter_time=1.103e-04, loss_ctc=73.984, loss_att=58.920, acc=0.705, loss=63.433, grad_norm=4.880, loss_scale=1.000, learning_rate=8.964e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 06:03:31,914 (deepspeed_trainer:228) INFO: 20epoch:train:13701-13800batch: iter_time=1.056e-04, loss_ctc=72.181, loss_att=51.986, acc=0.718, loss=58.047, grad_norm=4.283, loss_scale=1.000, learning_rate=8.963e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 06:04:09,514 (deepspeed_trainer:228) INFO: 20epoch:train:13801-13900batch: iter_time=1.086e-04, loss_ctc=86.874, loss_att=67.627, acc=0.696, loss=73.423, grad_norm=5.710, loss_scale=1.000, learning_rate=8.961e-05, step_time=0.376 [2024-12-07 06:04:47,249] [INFO] [logging.py:129:log_dist] [Rank 0] step=299000, skipped=0, lr=[np.float64(8.959201419543648e-05)], mom=[[0.9, 0.98]] [2024-12-07 06:04:47,250] [INFO] [timer.py:264:stop] epoch=0/micro_step=44000/global_step=44000, RunningAvgSamplesPerSec=43.890574745486525, CurrSamplesPerSec=41.01682386501504, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 06:04:47,251 (deepspeed_trainer:228) INFO: 20epoch:train:13901-14000batch: iter_time=1.071e-04, loss_ctc=81.386, loss_att=62.836, acc=0.711, loss=68.374, grad_norm=5.067, loss_scale=1.000, learning_rate=8.960e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 06:05:24,550 (deepspeed_trainer:228) INFO: 20epoch:train:14001-14100batch: iter_time=1.068e-04, loss_ctc=62.742, loss_att=50.921, acc=0.700, loss=54.468, grad_norm=4.352, loss_scale=1.000, learning_rate=8.958e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 06:06:01,931 (deepspeed_trainer:228) INFO: 20epoch:train:14101-14200batch: iter_time=1.065e-04, loss_ctc=71.597, loss_att=58.761, acc=0.704, loss=62.626, grad_norm=4.493, loss_scale=1.000, learning_rate=8.957e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 06:06:39,817 (deepspeed_trainer:228) INFO: 20epoch:train:14201-14300batch: iter_time=1.070e-04, loss_ctc=77.291, loss_att=63.601, acc=0.689, loss=67.685, grad_norm=4.751, loss_scale=1.000, learning_rate=8.955e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 06:07:17,207 (deepspeed_trainer:228) INFO: 20epoch:train:14301-14400batch: iter_time=1.108e-04, loss_ctc=73.916, loss_att=59.518, acc=0.697, loss=63.842, grad_norm=4.557, loss_scale=1.000, learning_rate=8.954e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 06:07:54,444 (deepspeed_trainer:228) INFO: 20epoch:train:14401-14500batch: iter_time=1.079e-04, loss_ctc=65.089, loss_att=47.723, acc=0.709, loss=52.913, grad_norm=4.502, loss_scale=1.000, learning_rate=8.952e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 06:08:32,078 (deepspeed_trainer:228) INFO: 20epoch:train:14501-14600batch: iter_time=1.098e-04, loss_ctc=79.226, loss_att=58.328, acc=0.714, loss=64.597, grad_norm=4.636, loss_scale=1.000, learning_rate=8.951e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 06:09:09,546 (deepspeed_trainer:228) INFO: 20epoch:train:14601-14700batch: iter_time=1.107e-04, loss_ctc=80.793, loss_att=58.355, acc=0.690, loss=65.101, grad_norm=5.634, loss_scale=1.000, learning_rate=8.949e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 06:09:46,802 (deepspeed_trainer:228) INFO: 20epoch:train:14701-14800batch: iter_time=1.112e-04, loss_ctc=70.028, loss_att=58.180, acc=0.696, loss=61.747, grad_norm=4.749, loss_scale=1.000, learning_rate=8.948e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 06:10:24,478 (deepspeed_trainer:228) INFO: 20epoch:train:14801-14900batch: iter_time=1.104e-04, loss_ctc=74.635, loss_att=61.874, acc=0.697, loss=65.696, grad_norm=4.633, loss_scale=1.000, learning_rate=8.946e-05, step_time=0.376 [2024-12-07 06:11:02,528] [INFO] [logging.py:129:log_dist] [Rank 0] step=300000, skipped=0, lr=[np.float64(8.944257002916577e-05)], mom=[[0.9, 0.98]] [2024-12-07 06:11:02,528] [INFO] [timer.py:264:stop] epoch=0/micro_step=45000/global_step=45000, RunningAvgSamplesPerSec=43.88622775353046, CurrSamplesPerSec=45.65905360676839, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 06:11:02,530 (deepspeed_trainer:228) INFO: 20epoch:train:14901-15000batch: iter_time=1.072e-04, loss_ctc=71.469, loss_att=62.894, acc=0.689, loss=65.444, grad_norm=4.604, loss_scale=1.000, learning_rate=8.945e-05, step_time=0.380 [2024-12-07 06:11:17,085] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:17,638] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:17,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:15,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:15,521] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:15,548] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:15,680] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:15,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:15,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:18,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:15,873] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:15,886] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:18,902] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:18,940] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:18,986] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:19,024] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:32,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:33,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:33,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:30,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:31,717] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:34,886] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:35,847] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:32,913] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:36,002] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:36,042] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:33,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:36,068] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:33,123] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:33,185] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:33,215] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:33,290] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:48,781] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:49,089] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:46,452] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:49,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:47,089] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:50,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:48,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:48,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:51,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:52,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:49,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:49,400] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:52,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:49,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:52,453] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:11:49,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:04,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:01,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:04,682] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:05,153] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:03,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:06,228] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:03,457] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:04,018] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:04,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:07,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:07,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:05,046] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:05,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:08,111] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:08,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:05,348] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:12:21,136] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 20 is about to be saved! [2024-12-07 06:12:21,167] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/mp_rank_00_model_states.pt [2024-12-07 06:12:21,167] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/mp_rank_00_model_states.pt... [2024-12-07 06:12:23,144] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/mp_rank_00_model_states.pt. [2024-12-07 06:12:23,300] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-07 06:12:23,301] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-07 06:12:20,345] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-07 06:12:20,345] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-07 06:12:23,304] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-07 06:12:23,304] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-07 06:12:20,350] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-07 06:12:20,350] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-07 06:12:20,350] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-07 06:12:23,306] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-07 06:12:23,306] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-07 06:12:23,307] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-07 06:12:20,351] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-07 06:12:20,351] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-07 06:12:23,307] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-07 06:12:20,352] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-07 06:12:24,038] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-07 06:12:24,039] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-07 06:12:24,039] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [2024-12-07 06:12:24,062] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-07 06:12:24,062] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-07 06:12:24,062] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [2024-12-07 06:12:24,063] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-07 06:12:24,064] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-07 06:12:24,064] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [2024-12-07 06:12:21,138] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-07 06:12:21,138] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-07 06:12:21,138] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-07 06:12:21,138] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-07 06:12:21,138] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [2024-12-07 06:12:21,138] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [2024-12-07 06:12:21,158] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-07 06:12:21,158] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-07 06:12:21,158] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [2024-12-07 06:12:21,162] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-07 06:12:21,162] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-07 06:12:21,162] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [2024-12-07 06:12:21,175] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-07 06:12:21,175] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-07 06:12:21,175] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [2024-12-07 06:12:24,158] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-07 06:12:24,158] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-07 06:12:24,158] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [2024-12-07 06:12:24,163] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-07 06:12:21,207] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-07 06:12:21,207] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-07 06:12:21,207] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [2024-12-07 06:12:21,218] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-07 06:12:21,218] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-07 06:12:21,218] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [2024-12-07 06:12:21,222] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-07 06:12:21,222] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-07 06:12:21,222] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [2024-12-07 06:12:24,320] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-07 06:12:24,320] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [2024-12-07 06:12:24,924] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-07 06:12:24,924] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-07 06:12:24,925] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [2024-12-07 06:12:25,261] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-07 06:12:25,262] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-07 06:12:25,262] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [2024-12-07 06:12:25,464] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-07 06:12:25,464] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_20/20/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-07 06:12:25,464] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 20 is ready now! [cnode7-012:0/16] 2024-12-07 06:12:25,472 (deepspeed_trainer:158) INFO: 20epoch results: [train] iter_time=0.136, loss_ctc=74.767, loss_att=57.809, acc=0.700, loss=62.898, grad_norm=4.711, loss_scale=1.000, learning_rate=9.059e-05, step_time=0.373, time=2 hours, 7 minutes and 23.68 seconds, total_count=300020, gpu_max_cached_mem_GB=27.986, [valid] loss_ctc=4.781, cer_ctc=0.115, loss_att=8.812, acc=0.768, cer=0.433, wer=1.000, loss=7.594, time=1 minute and 9.88 seconds, total_count=20, gpu_max_cached_mem_GB=27.986 [cnode7-012:0/16] 2024-12-07 06:12:27,215 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-07 06:12:53,986 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 06:13:09,312 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 06:13:09,312 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 06:13:09,315 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 06:13:29,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:13:30,308] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:13:27,678] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:13:27,884] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:13:31,313] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:13:31,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:13:32,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:13:29,294] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:13:32,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:13:29,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:13:29,473] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:13:29,847] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:13:29,883] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:13:33,343] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:13:33,363] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:13:30,972] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:16,624] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:17,765] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:15,036] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:15,087] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:18,582] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:19,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:16,675] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:20,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:17,304] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:17,385] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:20,553] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:17,608] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:20,989] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:20,990] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:18,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:14:19,518] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:03,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:04,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:03,164] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:06,496] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:03,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:06,777] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:03,904] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:07,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:04,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:04,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:07,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:08,275] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:08,548] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:05,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:06,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:07,233] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:50,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:52,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:50,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:51,293] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:54,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:54,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:51,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:52,696] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:55,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:53,222] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:56,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:56,594] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:53,844] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:54,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:57,693] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:15:55,142] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 06:17:02,039 (deepspeed_trainer:228) INFO: 21epoch:train:1-100batch: iter_time=2.356, loss_ctc=79.152, loss_att=67.100, acc=0.684, loss=70.725, grad_norm=4.740, loss_scale=1.000, learning_rate=8.944e-05, step_time=0.392 [cnode7-012:0/16] 2024-12-07 06:17:39,502 (deepspeed_trainer:228) INFO: 21epoch:train:101-200batch: iter_time=1.096e-04, loss_ctc=64.508, loss_att=54.186, acc=0.682, loss=57.298, grad_norm=4.816, loss_scale=1.000, learning_rate=8.942e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 06:18:16,839 (deepspeed_trainer:228) INFO: 21epoch:train:201-300batch: iter_time=1.081e-04, loss_ctc=84.239, loss_att=60.857, acc=0.695, loss=67.856, grad_norm=5.314, loss_scale=1.000, learning_rate=8.941e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 06:18:53,946 (deepspeed_trainer:228) INFO: 21epoch:train:301-400batch: iter_time=1.077e-04, loss_ctc=69.666, loss_att=54.815, acc=0.698, loss=59.288, grad_norm=4.374, loss_scale=1.000, learning_rate=8.939e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 06:19:30,883 (deepspeed_trainer:228) INFO: 21epoch:train:401-500batch: iter_time=1.099e-04, loss_ctc=71.704, loss_att=53.395, acc=0.709, loss=58.881, grad_norm=4.362, loss_scale=1.000, learning_rate=8.938e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 06:20:08,180 (deepspeed_trainer:228) INFO: 21epoch:train:501-600batch: iter_time=1.097e-04, loss_ctc=70.618, loss_att=54.078, acc=0.709, loss=59.081, grad_norm=4.443, loss_scale=1.000, learning_rate=8.936e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 06:20:45,338 (deepspeed_trainer:228) INFO: 21epoch:train:601-700batch: iter_time=1.107e-04, loss_ctc=84.137, loss_att=62.139, acc=0.698, loss=68.736, grad_norm=5.244, loss_scale=1.000, learning_rate=8.935e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 06:21:22,483 (deepspeed_trainer:228) INFO: 21epoch:train:701-800batch: iter_time=1.083e-04, loss_ctc=75.581, loss_att=58.552, acc=0.702, loss=63.664, grad_norm=5.278, loss_scale=1.000, learning_rate=8.933e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 06:21:59,921 (deepspeed_trainer:228) INFO: 21epoch:train:801-900batch: iter_time=1.128e-04, loss_ctc=84.071, loss_att=63.828, acc=0.692, loss=69.924, grad_norm=5.227, loss_scale=1.000, learning_rate=8.932e-05, step_time=0.374 [2024-12-07 06:22:37,148] [INFO] [logging.py:129:log_dist] [Rank 0] step=301000, skipped=0, lr=[np.float64(8.929387121871501e-05)], mom=[[0.9, 0.98]] [2024-12-07 06:22:37,148] [INFO] [timer.py:264:stop] epoch=0/micro_step=46000/global_step=46000, RunningAvgSamplesPerSec=43.88455041234193, CurrSamplesPerSec=44.28589696478586, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 06:22:37,150 (deepspeed_trainer:228) INFO: 21epoch:train:901-1000batch: iter_time=1.110e-04, loss_ctc=75.116, loss_att=56.613, acc=0.699, loss=62.167, grad_norm=4.836, loss_scale=1.000, learning_rate=8.930e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 06:23:14,800 (deepspeed_trainer:228) INFO: 21epoch:train:1001-1100batch: iter_time=1.145e-04, loss_ctc=82.317, loss_att=58.119, acc=0.688, loss=65.398, grad_norm=5.291, loss_scale=1.000, learning_rate=8.929e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 06:23:52,529 (deepspeed_trainer:228) INFO: 21epoch:train:1101-1200batch: iter_time=1.104e-04, loss_ctc=79.314, loss_att=56.338, acc=0.705, loss=63.232, grad_norm=5.017, loss_scale=1.000, learning_rate=8.927e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 06:24:29,654 (deepspeed_trainer:228) INFO: 21epoch:train:1201-1300batch: iter_time=1.151e-04, loss_ctc=79.106, loss_att=58.957, acc=0.700, loss=65.022, grad_norm=5.704, loss_scale=1.000, learning_rate=8.926e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 06:25:06,786 (deepspeed_trainer:228) INFO: 21epoch:train:1301-1400batch: iter_time=1.123e-04, loss_ctc=72.948, loss_att=55.764, acc=0.702, loss=60.922, grad_norm=4.645, loss_scale=1.000, learning_rate=8.924e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 06:25:43,979 (deepspeed_trainer:228) INFO: 21epoch:train:1401-1500batch: iter_time=1.131e-04, loss_ctc=81.877, loss_att=62.939, acc=0.697, loss=68.599, grad_norm=5.174, loss_scale=1.000, learning_rate=8.923e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 06:26:20,832 (deepspeed_trainer:228) INFO: 21epoch:train:1501-1600batch: iter_time=1.090e-04, loss_ctc=66.860, loss_att=46.167, acc=0.727, loss=52.373, grad_norm=4.182, loss_scale=1.000, learning_rate=8.921e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 06:26:58,128 (deepspeed_trainer:228) INFO: 21epoch:train:1601-1700batch: iter_time=1.139e-04, loss_ctc=71.248, loss_att=57.758, acc=0.703, loss=61.804, grad_norm=5.345, loss_scale=1.000, learning_rate=8.920e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 06:27:36,136 (deepspeed_trainer:228) INFO: 21epoch:train:1701-1800batch: iter_time=1.079e-04, loss_ctc=89.737, loss_att=68.413, acc=0.688, loss=74.814, grad_norm=6.245, loss_scale=1.000, learning_rate=8.918e-05, step_time=0.380 [cnode7-012:0/16] 2024-12-07 06:28:07,630 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-07 06:28:33,344 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 06:28:50,705 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 06:28:50,706 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 06:28:50,708 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 06:29:15,793] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:29:16,403] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:29:16,440] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:29:19,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:29:19,592] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:29:19,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:29:21,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:29:21,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:29:22,471] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:29:22,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:29:22,643] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:29:22,722] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:29:22,828] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:29:22,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:29:22,877] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:29:22,913] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:05,048] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:05,133] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:05,137] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:08,029] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:08,822] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:10,987] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:11,216] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:11,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:15,287] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:15,887] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:17,068] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:17,103] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:17,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:17,282] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:17,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:17,689] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:54,215] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:54,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:54,881] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:56,869] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:30:57,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:00,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:00,404] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:01,888] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:03,869] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:04,298] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:04,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:05,195] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:05,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:05,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:08,121] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:08,749] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:45,380] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:45,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:45,823] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:45,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:46,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:48,805] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:49,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:50,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:52,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:52,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:53,280] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:54,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:54,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:54,608] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:56,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:31:59,119] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 06:32:38,212 (deepspeed_trainer:228) INFO: 21epoch:train:1801-1900batch: iter_time=2.505, loss_ctc=74.369, loss_att=60.458, acc=0.694, loss=64.591, grad_norm=4.831, loss_scale=1.000, learning_rate=8.917e-05, step_time=0.515 [2024-12-07 06:33:16,459] [INFO] [logging.py:129:log_dist] [Rank 0] step=302000, skipped=0, lr=[np.float64(8.914591158881871e-05)], mom=[[0.9, 0.98]] [2024-12-07 06:33:16,462] [INFO] [timer.py:264:stop] epoch=0/micro_step=47000/global_step=47000, RunningAvgSamplesPerSec=43.84639194921396, CurrSamplesPerSec=45.038502669966896, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 06:33:16,463 (deepspeed_trainer:228) INFO: 21epoch:train:1901-2000batch: iter_time=1.212e-04, loss_ctc=77.269, loss_att=63.654, acc=0.694, loss=67.775, grad_norm=5.019, loss_scale=1.000, learning_rate=8.915e-05, step_time=0.382 [cnode7-012:0/16] 2024-12-07 06:33:53,267 (deepspeed_trainer:228) INFO: 21epoch:train:2001-2100batch: iter_time=1.105e-04, loss_ctc=67.813, loss_att=53.726, acc=0.693, loss=57.951, grad_norm=4.880, loss_scale=1.000, learning_rate=8.914e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 06:34:30,676 (deepspeed_trainer:228) INFO: 21epoch:train:2101-2200batch: iter_time=1.165e-04, loss_ctc=76.397, loss_att=58.820, acc=0.691, loss=64.082, grad_norm=5.377, loss_scale=1.000, learning_rate=8.912e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 06:35:08,036 (deepspeed_trainer:228) INFO: 21epoch:train:2201-2300batch: iter_time=1.169e-04, loss_ctc=65.689, loss_att=48.178, acc=0.706, loss=53.418, grad_norm=4.464, loss_scale=1.000, learning_rate=8.911e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 06:35:45,624 (deepspeed_trainer:228) INFO: 21epoch:train:2301-2400batch: iter_time=1.160e-04, loss_ctc=74.597, loss_att=56.242, acc=0.719, loss=61.761, grad_norm=4.261, loss_scale=1.000, learning_rate=8.909e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 06:36:23,269 (deepspeed_trainer:228) INFO: 21epoch:train:2401-2500batch: iter_time=1.178e-04, loss_ctc=76.895, loss_att=56.685, acc=0.704, loss=62.746, grad_norm=4.895, loss_scale=1.000, learning_rate=8.908e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 06:37:01,325 (deepspeed_trainer:228) INFO: 21epoch:train:2501-2600batch: iter_time=1.172e-04, loss_ctc=79.419, loss_att=63.122, acc=0.697, loss=68.008, grad_norm=4.235, loss_scale=1.000, learning_rate=8.906e-05, step_time=0.381 [cnode7-012:0/16] 2024-12-07 06:37:38,499 (deepspeed_trainer:228) INFO: 21epoch:train:2601-2700batch: iter_time=1.167e-04, loss_ctc=75.747, loss_att=55.315, acc=0.710, loss=61.446, grad_norm=4.752, loss_scale=1.000, learning_rate=8.905e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 06:38:15,045 (deepspeed_trainer:228) INFO: 21epoch:train:2701-2800batch: iter_time=1.116e-04, loss_ctc=78.537, loss_att=60.294, acc=0.700, loss=65.759, grad_norm=4.888, loss_scale=1.000, learning_rate=8.904e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 06:38:51,731 (deepspeed_trainer:228) INFO: 21epoch:train:2801-2900batch: iter_time=1.106e-04, loss_ctc=80.422, loss_att=54.460, acc=0.700, loss=62.272, grad_norm=4.502, loss_scale=1.000, learning_rate=8.902e-05, step_time=0.366 [2024-12-07 06:39:28,315] [INFO] [logging.py:129:log_dist] [Rank 0] step=303000, skipped=0, lr=[np.float64(8.899868503560157e-05)], mom=[[0.9, 0.98]] [2024-12-07 06:39:28,316] [INFO] [timer.py:264:stop] epoch=0/micro_step=48000/global_step=48000, RunningAvgSamplesPerSec=43.85334815857371, CurrSamplesPerSec=44.73607552239748, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 06:39:28,317 (deepspeed_trainer:228) INFO: 21epoch:train:2901-3000batch: iter_time=1.125e-04, loss_ctc=76.054, loss_att=56.914, acc=0.695, loss=62.668, grad_norm=5.017, loss_scale=1.000, learning_rate=8.901e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 06:40:05,147 (deepspeed_trainer:228) INFO: 21epoch:train:3001-3100batch: iter_time=1.137e-04, loss_ctc=79.116, loss_att=61.485, acc=0.700, loss=66.817, grad_norm=4.913, loss_scale=1.000, learning_rate=8.899e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 06:40:41,822 (deepspeed_trainer:228) INFO: 21epoch:train:3101-3200batch: iter_time=1.089e-04, loss_ctc=70.614, loss_att=51.695, acc=0.712, loss=57.384, grad_norm=5.182, loss_scale=1.000, learning_rate=8.898e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 06:41:18,749 (deepspeed_trainer:228) INFO: 21epoch:train:3201-3300batch: iter_time=1.168e-04, loss_ctc=80.397, loss_att=59.785, acc=0.704, loss=65.941, grad_norm=4.912, loss_scale=1.000, learning_rate=8.896e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 06:41:55,695 (deepspeed_trainer:228) INFO: 21epoch:train:3301-3400batch: iter_time=1.113e-04, loss_ctc=74.692, loss_att=58.095, acc=0.708, loss=63.074, grad_norm=4.432, loss_scale=1.000, learning_rate=8.895e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 06:42:32,840 (deepspeed_trainer:228) INFO: 21epoch:train:3401-3500batch: iter_time=1.126e-04, loss_ctc=69.640, loss_att=54.166, acc=0.716, loss=58.824, grad_norm=4.068, loss_scale=1.000, learning_rate=8.893e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 06:43:09,638 (deepspeed_trainer:228) INFO: 21epoch:train:3501-3600batch: iter_time=1.111e-04, loss_ctc=72.636, loss_att=55.490, acc=0.697, loss=60.662, grad_norm=5.183, loss_scale=1.000, learning_rate=8.892e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 06:43:46,904 (deepspeed_trainer:228) INFO: 21epoch:train:3601-3700batch: iter_time=1.142e-04, loss_ctc=85.964, loss_att=63.763, acc=0.702, loss=70.376, grad_norm=5.359, loss_scale=1.000, learning_rate=8.890e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 06:44:10,812 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-07 06:44:37,211 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 06:44:54,309 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 06:44:54,309 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 06:44:54,312 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 06:45:16,215] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:45:16,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:45:17,321] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:45:18,017] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:45:18,038] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:45:18,229] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:45:18,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:45:18,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:45:21,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:45:21,464] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:45:21,482] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:45:22,928] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:45:23,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:45:24,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:45:24,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:45:27,260] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:04,234] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:05,111] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:05,592] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:05,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:06,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:06,351] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:06,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:09,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:10,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:10,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:11,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:09,582] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:12,219] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:12,453] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:14,976] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:15,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:52,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:52,677] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:53,463] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:55,006] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:55,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:55,237] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:55,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:58,298] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:59,588] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:59,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:01,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:02,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:46:59,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:02,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:04,582] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:05,895] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:40,391] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:40,658] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:41,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:42,678] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:42,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:45,997] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:44,005] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:44,085] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:47,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:48,024] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:50,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:48,059] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:51,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:52,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:53,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 06:47:54,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 06:48:39,043 (deepspeed_trainer:228) INFO: 21epoch:train:3701-3800batch: iter_time=2.546, loss_ctc=77.042, loss_att=67.304, acc=0.682, loss=70.234, grad_norm=4.874, loss_scale=1.000, learning_rate=8.889e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 06:49:16,174 (deepspeed_trainer:228) INFO: 21epoch:train:3801-3900batch: iter_time=1.100e-04, loss_ctc=66.361, loss_att=57.066, acc=0.684, loss=59.865, grad_norm=4.768, loss_scale=1.000, learning_rate=8.887e-05, step_time=0.371 [2024-12-07 06:49:53,477] [INFO] [logging.py:129:log_dist] [Rank 0] step=304000, skipped=0, lr=[np.float64(8.8852185525521e-05)], mom=[[0.9, 0.98]] [2024-12-07 06:49:53,478] [INFO] [timer.py:264:stop] epoch=0/micro_step=49000/global_step=49000, RunningAvgSamplesPerSec=43.862226233100614, CurrSamplesPerSec=43.32088248200371, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 06:49:53,479 (deepspeed_trainer:228) INFO: 21epoch:train:3901-4000batch: iter_time=1.102e-04, loss_ctc=75.436, loss_att=57.754, acc=0.705, loss=63.054, grad_norm=4.897, loss_scale=1.000, learning_rate=8.886e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 06:50:30,515 (deepspeed_trainer:228) INFO: 21epoch:train:4001-4100batch: iter_time=1.086e-04, loss_ctc=71.352, loss_att=53.667, acc=0.694, loss=58.951, grad_norm=5.142, loss_scale=1.000, learning_rate=8.884e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 06:51:07,499 (deepspeed_trainer:228) INFO: 21epoch:train:4101-4200batch: iter_time=1.092e-04, loss_ctc=66.438, loss_att=48.936, acc=0.704, loss=54.194, grad_norm=4.222, loss_scale=1.000, learning_rate=8.883e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 06:51:45,053 (deepspeed_trainer:228) INFO: 21epoch:train:4201-4300batch: iter_time=1.135e-04, loss_ctc=74.461, loss_att=57.507, acc=0.708, loss=62.587, grad_norm=4.654, loss_scale=1.000, learning_rate=8.882e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 06:52:22,422 (deepspeed_trainer:228) INFO: 21epoch:train:4301-4400batch: iter_time=1.078e-04, loss_ctc=82.403, loss_att=63.101, acc=0.695, loss=68.932, grad_norm=5.019, loss_scale=1.000, learning_rate=8.880e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 06:52:59,394 (deepspeed_trainer:228) INFO: 21epoch:train:4401-4500batch: iter_time=1.072e-04, loss_ctc=75.288, loss_att=57.005, acc=0.698, loss=62.483, grad_norm=5.187, loss_scale=1.000, learning_rate=8.879e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 06:53:36,346 (deepspeed_trainer:228) INFO: 21epoch:train:4501-4600batch: iter_time=1.074e-04, loss_ctc=77.863, loss_att=58.977, acc=0.697, loss=64.628, grad_norm=5.194, loss_scale=1.000, learning_rate=8.877e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 06:54:13,515 (deepspeed_trainer:228) INFO: 21epoch:train:4601-4700batch: iter_time=1.107e-04, loss_ctc=72.726, loss_att=56.008, acc=0.695, loss=61.024, grad_norm=4.650, loss_scale=1.000, learning_rate=8.876e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 06:54:50,522 (deepspeed_trainer:228) INFO: 21epoch:train:4701-4800batch: iter_time=1.107e-04, loss_ctc=85.141, loss_att=56.346, acc=0.692, loss=65.012, grad_norm=5.143, loss_scale=1.000, learning_rate=8.874e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 06:55:27,367 (deepspeed_trainer:228) INFO: 21epoch:train:4801-4900batch: iter_time=1.141e-04, loss_ctc=71.642, loss_att=55.883, acc=0.697, loss=60.596, grad_norm=4.837, loss_scale=1.000, learning_rate=8.873e-05, step_time=0.368 [2024-12-07 06:56:04,426] [INFO] [logging.py:129:log_dist] [Rank 0] step=305000, skipped=0, lr=[np.float64(8.870640709432844e-05)], mom=[[0.9, 0.98]] [2024-12-07 06:56:04,426] [INFO] [timer.py:264:stop] epoch=0/micro_step=50000/global_step=50000, RunningAvgSamplesPerSec=43.87051245900706, CurrSamplesPerSec=45.531361033740616, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 06:56:04,428 (deepspeed_trainer:228) INFO: 21epoch:train:4901-5000batch: iter_time=1.111e-04, loss_ctc=80.972, loss_att=59.763, acc=0.694, loss=66.141, grad_norm=5.165, loss_scale=1.000, learning_rate=8.871e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 06:56:42,026 (deepspeed_trainer:228) INFO: 21epoch:train:5001-5100batch: iter_time=1.096e-04, loss_ctc=73.313, loss_att=55.319, acc=0.714, loss=60.696, grad_norm=4.393, loss_scale=1.000, learning_rate=8.870e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 06:57:19,188 (deepspeed_trainer:228) INFO: 21epoch:train:5101-5200batch: iter_time=1.080e-04, loss_ctc=77.876, loss_att=58.439, acc=0.696, loss=64.263, grad_norm=4.796, loss_scale=1.000, learning_rate=8.868e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 06:57:55,725 (deepspeed_trainer:228) INFO: 21epoch:train:5201-5300batch: iter_time=1.114e-04, loss_ctc=65.569, loss_att=49.582, acc=0.707, loss=54.401, grad_norm=3.693, loss_scale=1.000, learning_rate=8.867e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 06:58:32,671 (deepspeed_trainer:228) INFO: 21epoch:train:5301-5400batch: iter_time=1.133e-04, loss_ctc=73.055, loss_att=58.042, acc=0.706, loss=62.576, grad_norm=4.503, loss_scale=1.000, learning_rate=8.866e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 06:59:10,066 (deepspeed_trainer:228) INFO: 21epoch:train:5401-5500batch: iter_time=1.181e-04, loss_ctc=77.303, loss_att=59.821, acc=0.694, loss=65.058, grad_norm=5.416, loss_scale=1.000, learning_rate=8.864e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 06:59:47,362 (deepspeed_trainer:228) INFO: 21epoch:train:5501-5600batch: iter_time=1.115e-04, loss_ctc=77.845, loss_att=55.950, acc=0.703, loss=62.497, grad_norm=5.291, loss_scale=1.000, learning_rate=8.863e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 07:00:00,960 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-07 07:00:27,980 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 07:00:45,125 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 07:00:45,125 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 07:00:45,128 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 07:01:07,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:08,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:10,198] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:09,161] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:09,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:09,448] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:09,605] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:09,676] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:12,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:10,458] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:10,536] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:10,542] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:13,464] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:13,783] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:14,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:14,511] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:57,467] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:57,981] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:59,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:58,550] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:01:59,142] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:00,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:00,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:00,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:00,299] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:00,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:03,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:00,418] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:04,308] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:04,721] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:04,973] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:06,953] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:45,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:46,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:48,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:47,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:47,464] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:47,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:48,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:48,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:49,251] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:49,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:49,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:52,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:53,676] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:54,426] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:56,876] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:02:57,519] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:33,149] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:33,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:35,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:35,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:36,448] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:36,757] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:37,404] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:37,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:37,907] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:37,966] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:38,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:41,112] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:41,702] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:42,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:44,955] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:03:45,550] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 07:04:39,605 (deepspeed_trainer:228) INFO: 21epoch:train:5601-5700batch: iter_time=2.512, loss_ctc=83.153, loss_att=72.314, acc=0.679, loss=75.573, grad_norm=4.513, loss_scale=1.000, learning_rate=8.861e-05, step_time=0.410 [cnode7-012:0/16] 2024-12-07 07:05:16,499 (deepspeed_trainer:228) INFO: 21epoch:train:5701-5800batch: iter_time=1.053e-04, loss_ctc=63.861, loss_att=51.425, acc=0.689, loss=55.149, grad_norm=4.919, loss_scale=1.000, learning_rate=8.860e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 07:05:53,788 (deepspeed_trainer:228) INFO: 21epoch:train:5801-5900batch: iter_time=1.075e-04, loss_ctc=75.676, loss_att=53.331, acc=0.708, loss=60.056, grad_norm=4.253, loss_scale=1.000, learning_rate=8.858e-05, step_time=0.373 [2024-12-07 07:06:31,119] [INFO] [logging.py:129:log_dist] [Rank 0] step=306000, skipped=0, lr=[np.float64(8.856134384604958e-05)], mom=[[0.9, 0.98]] [2024-12-07 07:06:31,120] [INFO] [timer.py:264:stop] epoch=0/micro_step=51000/global_step=51000, RunningAvgSamplesPerSec=43.8677878769661, CurrSamplesPerSec=43.843599928012665, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 07:06:31,122 (deepspeed_trainer:228) INFO: 21epoch:train:5901-6000batch: iter_time=1.097e-04, loss_ctc=72.470, loss_att=57.054, acc=0.692, loss=61.661, grad_norm=4.988, loss_scale=1.000, learning_rate=8.857e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 07:07:08,176 (deepspeed_trainer:228) INFO: 21epoch:train:6001-6100batch: iter_time=1.084e-04, loss_ctc=67.152, loss_att=47.377, acc=0.708, loss=53.316, grad_norm=4.433, loss_scale=1.000, learning_rate=8.855e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 07:07:45,853 (deepspeed_trainer:228) INFO: 21epoch:train:6101-6200batch: iter_time=1.101e-04, loss_ctc=72.101, loss_att=55.003, acc=0.714, loss=60.113, grad_norm=4.225, loss_scale=1.000, learning_rate=8.854e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 07:08:22,915 (deepspeed_trainer:228) INFO: 21epoch:train:6201-6300batch: iter_time=1.124e-04, loss_ctc=82.788, loss_att=61.004, acc=0.698, loss=67.571, grad_norm=5.206, loss_scale=1.000, learning_rate=8.853e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 07:09:00,038 (deepspeed_trainer:228) INFO: 21epoch:train:6301-6400batch: iter_time=1.075e-04, loss_ctc=70.427, loss_att=52.630, acc=0.707, loss=57.973, grad_norm=4.361, loss_scale=1.000, learning_rate=8.851e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 07:09:37,356 (deepspeed_trainer:228) INFO: 21epoch:train:6401-6500batch: iter_time=1.075e-04, loss_ctc=83.251, loss_att=65.438, acc=0.692, loss=70.792, grad_norm=5.176, loss_scale=1.000, learning_rate=8.850e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 07:10:14,573 (deepspeed_trainer:228) INFO: 21epoch:train:6501-6600batch: iter_time=1.083e-04, loss_ctc=68.543, loss_att=50.336, acc=0.705, loss=55.787, grad_norm=4.170, loss_scale=1.000, learning_rate=8.848e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 07:10:51,940 (deepspeed_trainer:228) INFO: 21epoch:train:6601-6700batch: iter_time=1.092e-04, loss_ctc=84.978, loss_att=57.625, acc=0.690, loss=65.835, grad_norm=5.177, loss_scale=1.000, learning_rate=8.847e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 07:11:29,268 (deepspeed_trainer:228) INFO: 21epoch:train:6701-6800batch: iter_time=1.076e-04, loss_ctc=76.399, loss_att=58.200, acc=0.701, loss=63.670, grad_norm=4.986, loss_scale=1.000, learning_rate=8.845e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 07:12:06,777 (deepspeed_trainer:228) INFO: 21epoch:train:6801-6900batch: iter_time=1.103e-04, loss_ctc=72.149, loss_att=52.361, acc=0.705, loss=58.311, grad_norm=4.806, loss_scale=1.000, learning_rate=8.844e-05, step_time=0.375 [2024-12-07 07:12:44,108] [INFO] [logging.py:129:log_dist] [Rank 0] step=307000, skipped=0, lr=[np.float64(8.841698995198274e-05)], mom=[[0.9, 0.98]] [2024-12-07 07:12:44,109] [INFO] [timer.py:264:stop] epoch=0/micro_step=52000/global_step=52000, RunningAvgSamplesPerSec=43.87005523965017, CurrSamplesPerSec=43.112017108370104, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 07:12:44,110 (deepspeed_trainer:228) INFO: 21epoch:train:6901-7000batch: iter_time=1.095e-04, loss_ctc=75.440, loss_att=58.384, acc=0.705, loss=63.516, grad_norm=4.219, loss_scale=1.000, learning_rate=8.842e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 07:13:21,602 (deepspeed_trainer:228) INFO: 21epoch:train:7001-7100batch: iter_time=1.133e-04, loss_ctc=77.964, loss_att=57.619, acc=0.695, loss=63.729, grad_norm=4.607, loss_scale=1.000, learning_rate=8.841e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 07:13:58,870 (deepspeed_trainer:228) INFO: 21epoch:train:7101-7200batch: iter_time=1.117e-04, loss_ctc=67.076, loss_att=49.627, acc=0.721, loss=54.868, grad_norm=4.387, loss_scale=1.000, learning_rate=8.840e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 07:14:36,472 (deepspeed_trainer:228) INFO: 21epoch:train:7201-7300batch: iter_time=1.116e-04, loss_ctc=68.382, loss_att=56.384, acc=0.699, loss=59.971, grad_norm=4.753, loss_scale=1.000, learning_rate=8.838e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 07:15:14,057 (deepspeed_trainer:228) INFO: 21epoch:train:7301-7400batch: iter_time=1.142e-04, loss_ctc=78.605, loss_att=62.684, acc=0.696, loss=67.477, grad_norm=5.698, loss_scale=1.000, learning_rate=8.837e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 07:15:51,234 (deepspeed_trainer:228) INFO: 21epoch:train:7401-7500batch: iter_time=1.079e-04, loss_ctc=79.116, loss_att=59.183, acc=0.699, loss=65.176, grad_norm=5.469, loss_scale=1.000, learning_rate=8.835e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 07:15:55,649 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-07 07:16:23,044 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 07:16:39,606 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 07:16:39,606 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 07:16:39,609 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 07:17:03,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:03,827] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:01,304] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:04,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:01,893] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:01,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:02,576] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:03,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:03,218] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:03,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:03,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:08,059] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:08,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:09,021] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:09,403] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:09,481] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:51,667] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:49,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:49,919] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:52,761] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:52,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:50,703] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:51,738] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:52,167] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:52,540] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:52,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:53,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:57,967] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:58,487] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:59,251] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:17:59,293] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:00,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:39,951] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:38,014] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:38,466] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:41,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:39,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:42,324] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:39,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:40,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:40,898] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:40,934] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:43,638] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:47,378] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:48,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:49,427] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:49,493] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:18:50,294] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:26,572] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:25,147] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:27,886] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:25,837] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:26,531] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:27,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:28,311] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:28,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:28,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:32,426] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:34,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:36,469] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:33,946] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:37,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:37,712] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:19:37,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 07:20:41,822 (deepspeed_trainer:228) INFO: 21epoch:train:7501-7600batch: iter_time=2.516, loss_ctc=77.061, loss_att=65.572, acc=0.687, loss=68.999, grad_norm=4.707, loss_scale=1.000, learning_rate=8.834e-05, step_time=0.390 [cnode7-012:0/16] 2024-12-07 07:21:18,758 (deepspeed_trainer:228) INFO: 21epoch:train:7601-7700batch: iter_time=1.129e-04, loss_ctc=63.354, loss_att=52.041, acc=0.686, loss=55.427, grad_norm=4.573, loss_scale=1.000, learning_rate=8.832e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 07:21:55,943 (deepspeed_trainer:228) INFO: 21epoch:train:7701-7800batch: iter_time=1.138e-04, loss_ctc=80.427, loss_att=56.358, acc=0.702, loss=63.593, grad_norm=5.299, loss_scale=1.000, learning_rate=8.831e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 07:22:33,535 (deepspeed_trainer:228) INFO: 21epoch:train:7801-7900batch: iter_time=1.219e-04, loss_ctc=68.125, loss_att=54.023, acc=0.700, loss=58.272, grad_norm=4.815, loss_scale=1.000, learning_rate=8.829e-05, step_time=0.375 [2024-12-07 07:23:11,181] [INFO] [logging.py:129:log_dist] [Rank 0] step=308000, skipped=0, lr=[np.float64(8.827333964971521e-05)], mom=[[0.9, 0.98]] [2024-12-07 07:23:11,181] [INFO] [timer.py:264:stop] epoch=0/micro_step=53000/global_step=53000, RunningAvgSamplesPerSec=43.86670041650123, CurrSamplesPerSec=43.39854040251847, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 07:23:11,183 (deepspeed_trainer:228) INFO: 21epoch:train:7901-8000batch: iter_time=1.160e-04, loss_ctc=69.673, loss_att=51.081, acc=0.713, loss=56.679, grad_norm=4.360, loss_scale=1.000, learning_rate=8.828e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 07:23:48,812 (deepspeed_trainer:228) INFO: 21epoch:train:8001-8100batch: iter_time=1.149e-04, loss_ctc=69.446, loss_att=52.390, acc=0.710, loss=57.492, grad_norm=4.199, loss_scale=1.000, learning_rate=8.827e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 07:24:26,444 (deepspeed_trainer:228) INFO: 21epoch:train:8101-8200batch: iter_time=1.144e-04, loss_ctc=80.382, loss_att=59.136, acc=0.704, loss=65.513, grad_norm=4.980, loss_scale=1.000, learning_rate=8.825e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 07:25:03,648 (deepspeed_trainer:228) INFO: 21epoch:train:8201-8300batch: iter_time=1.105e-04, loss_ctc=73.347, loss_att=55.750, acc=0.706, loss=61.040, grad_norm=3.926, loss_scale=1.000, learning_rate=8.824e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 07:25:41,059 (deepspeed_trainer:228) INFO: 21epoch:train:8301-8400batch: iter_time=1.083e-04, loss_ctc=81.524, loss_att=61.282, acc=0.692, loss=67.355, grad_norm=4.953, loss_scale=1.000, learning_rate=8.822e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 07:26:18,036 (deepspeed_trainer:228) INFO: 21epoch:train:8401-8500batch: iter_time=1.110e-04, loss_ctc=73.160, loss_att=53.887, acc=0.702, loss=59.653, grad_norm=4.559, loss_scale=1.000, learning_rate=8.821e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 07:26:55,010 (deepspeed_trainer:228) INFO: 21epoch:train:8501-8600batch: iter_time=1.092e-04, loss_ctc=78.699, loss_att=56.598, acc=0.690, loss=63.225, grad_norm=4.810, loss_scale=1.000, learning_rate=8.819e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 07:27:32,005 (deepspeed_trainer:228) INFO: 21epoch:train:8601-8700batch: iter_time=1.081e-04, loss_ctc=76.906, loss_att=54.216, acc=0.711, loss=61.029, grad_norm=4.409, loss_scale=1.000, learning_rate=8.818e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 07:28:09,075 (deepspeed_trainer:228) INFO: 21epoch:train:8701-8800batch: iter_time=1.104e-04, loss_ctc=75.967, loss_att=57.011, acc=0.702, loss=62.715, grad_norm=5.300, loss_scale=1.000, learning_rate=8.817e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 07:28:46,031 (deepspeed_trainer:228) INFO: 21epoch:train:8801-8900batch: iter_time=1.105e-04, loss_ctc=71.912, loss_att=54.050, acc=0.702, loss=59.405, grad_norm=4.951, loss_scale=1.000, learning_rate=8.815e-05, step_time=0.369 [2024-12-07 07:29:23,136] [INFO] [logging.py:129:log_dist] [Rank 0] step=309000, skipped=0, lr=[np.float64(8.813038724215712e-05)], mom=[[0.9, 0.98]] [2024-12-07 07:29:23,137] [INFO] [timer.py:264:stop] epoch=0/micro_step=54000/global_step=54000, RunningAvgSamplesPerSec=43.8703022845333, CurrSamplesPerSec=45.04745149452554, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 07:29:23,138 (deepspeed_trainer:228) INFO: 21epoch:train:8901-9000batch: iter_time=1.100e-04, loss_ctc=80.625, loss_att=62.254, acc=0.693, loss=67.762, grad_norm=4.627, loss_scale=1.000, learning_rate=8.814e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 07:29:59,876 (deepspeed_trainer:228) INFO: 21epoch:train:9001-9100batch: iter_time=1.108e-04, loss_ctc=65.879, loss_att=45.204, acc=0.730, loss=51.406, grad_norm=4.263, loss_scale=1.000, learning_rate=8.812e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 07:30:36,703 (deepspeed_trainer:228) INFO: 21epoch:train:9101-9200batch: iter_time=1.092e-04, loss_ctc=69.994, loss_att=57.262, acc=0.702, loss=61.078, grad_norm=4.462, loss_scale=1.000, learning_rate=8.811e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 07:31:13,742 (deepspeed_trainer:228) INFO: 21epoch:train:9201-9300batch: iter_time=1.099e-04, loss_ctc=84.668, loss_att=66.129, acc=0.693, loss=71.703, grad_norm=5.830, loss_scale=1.000, learning_rate=8.809e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 07:31:45,926 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-07 07:32:13,332 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 07:32:29,817 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 07:32:29,817 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 07:32:29,819 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 07:32:53,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:32:54,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:32:54,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:32:53,708] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:32:57,334] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:32:54,558] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:32:54,675] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:32:54,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:32:54,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:32:58,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:32:55,551] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:32:58,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:32:55,752] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:32:55,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:32:59,007] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:32:59,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:43,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:43,381] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:44,005] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:42,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:43,048] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:43,459] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:44,293] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:45,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:48,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:48,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:46,406] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:49,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:50,021] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:47,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:50,385] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:33:48,256] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:31,734] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:32,020] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:30,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:33,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:31,345] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:32,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:32,522] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:32,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:37,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:38,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:38,917] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:39,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:40,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:37,440] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:38,095] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:34:39,349] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:19,264] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:19,807] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:19,897] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:17,720] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:19,413] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:20,068] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:20,682] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:21,161] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:26,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:26,538] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:27,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:27,783] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:28,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:27,544] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:28,163] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:35:29,321] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 07:36:06,870 (deepspeed_trainer:228) INFO: 21epoch:train:9301-9400batch: iter_time=2.515, loss_ctc=72.965, loss_att=58.944, acc=0.694, loss=63.161, grad_norm=4.844, loss_scale=1.000, learning_rate=8.808e-05, step_time=0.416 [cnode7-012:0/16] 2024-12-07 07:36:44,289 (deepspeed_trainer:228) INFO: 21epoch:train:9401-9500batch: iter_time=1.049e-04, loss_ctc=75.094, loss_att=63.031, acc=0.692, loss=66.632, grad_norm=5.016, loss_scale=1.000, learning_rate=8.807e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 07:37:21,116 (deepspeed_trainer:228) INFO: 21epoch:train:9501-9600batch: iter_time=1.079e-04, loss_ctc=66.333, loss_att=52.439, acc=0.694, loss=56.593, grad_norm=4.591, loss_scale=1.000, learning_rate=8.805e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 07:37:57,989 (deepspeed_trainer:228) INFO: 21epoch:train:9601-9700batch: iter_time=1.076e-04, loss_ctc=74.621, loss_att=56.208, acc=0.696, loss=61.751, grad_norm=4.676, loss_scale=1.000, learning_rate=8.804e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 07:38:34,815 (deepspeed_trainer:228) INFO: 21epoch:train:9701-9800batch: iter_time=1.076e-04, loss_ctc=64.851, loss_att=47.748, acc=0.706, loss=52.881, grad_norm=4.153, loss_scale=1.000, learning_rate=8.802e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 07:39:11,900 (deepspeed_trainer:228) INFO: 21epoch:train:9801-9900batch: iter_time=1.071e-04, loss_ctc=72.906, loss_att=54.499, acc=0.718, loss=60.033, grad_norm=4.594, loss_scale=1.000, learning_rate=8.801e-05, step_time=0.371 [2024-12-07 07:39:48,904] [INFO] [logging.py:129:log_dist] [Rank 0] step=310000, skipped=0, lr=[np.float64(8.79881270965924e-05)], mom=[[0.9, 0.98]] [2024-12-07 07:39:48,905] [INFO] [timer.py:264:stop] epoch=0/micro_step=55000/global_step=55000, RunningAvgSamplesPerSec=43.869140923270464, CurrSamplesPerSec=44.605717108741665, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 07:39:48,906 (deepspeed_trainer:228) INFO: 21epoch:train:9901-10000batch: iter_time=1.106e-04, loss_ctc=74.660, loss_att=55.170, acc=0.702, loss=61.013, grad_norm=4.925, loss_scale=1.000, learning_rate=8.800e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 07:40:26,018 (deepspeed_trainer:228) INFO: 21epoch:train:10001-10100batch: iter_time=1.080e-04, loss_ctc=78.225, loss_att=60.759, acc=0.701, loss=65.976, grad_norm=4.772, loss_scale=1.000, learning_rate=8.798e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 07:41:03,094 (deepspeed_trainer:228) INFO: 21epoch:train:10101-10200batch: iter_time=1.093e-04, loss_ctc=74.063, loss_att=54.310, acc=0.710, loss=60.246, grad_norm=4.867, loss_scale=1.000, learning_rate=8.797e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 07:41:40,119 (deepspeed_trainer:228) INFO: 21epoch:train:10201-10300batch: iter_time=1.072e-04, loss_ctc=76.842, loss_att=59.227, acc=0.695, loss=64.540, grad_norm=5.043, loss_scale=1.000, learning_rate=8.795e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 07:42:17,352 (deepspeed_trainer:228) INFO: 21epoch:train:10301-10400batch: iter_time=1.101e-04, loss_ctc=78.538, loss_att=53.058, acc=0.701, loss=60.693, grad_norm=4.842, loss_scale=1.000, learning_rate=8.794e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 07:42:54,595 (deepspeed_trainer:228) INFO: 21epoch:train:10401-10500batch: iter_time=1.065e-04, loss_ctc=74.524, loss_att=55.267, acc=0.695, loss=61.040, grad_norm=4.864, loss_scale=1.000, learning_rate=8.792e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 07:43:31,928 (deepspeed_trainer:228) INFO: 21epoch:train:10501-10600batch: iter_time=1.098e-04, loss_ctc=77.256, loss_att=59.469, acc=0.702, loss=64.808, grad_norm=4.611, loss_scale=1.000, learning_rate=8.791e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 07:44:08,989 (deepspeed_trainer:228) INFO: 21epoch:train:10601-10700batch: iter_time=1.067e-04, loss_ctc=69.151, loss_att=49.953, acc=0.716, loss=55.718, grad_norm=4.892, loss_scale=1.000, learning_rate=8.790e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 07:44:46,533 (deepspeed_trainer:228) INFO: 21epoch:train:10701-10800batch: iter_time=1.098e-04, loss_ctc=78.848, loss_att=57.401, acc=0.704, loss=63.859, grad_norm=4.669, loss_scale=1.000, learning_rate=8.788e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 07:45:25,060 (deepspeed_trainer:228) INFO: 21epoch:train:10801-10900batch: iter_time=1.061e-04, loss_ctc=74.344, loss_att=57.943, acc=0.700, loss=62.873, grad_norm=4.502, loss_scale=1.000, learning_rate=8.787e-05, step_time=0.385 [2024-12-07 07:46:03,188] [INFO] [logging.py:129:log_dist] [Rank 0] step=311000, skipped=0, lr=[np.float64(8.784655364374657e-05)], mom=[[0.9, 0.98]] [2024-12-07 07:46:03,188] [INFO] [timer.py:264:stop] epoch=0/micro_step=56000/global_step=56000, RunningAvgSamplesPerSec=43.868793742606684, CurrSamplesPerSec=41.92969602351024, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 07:46:03,190 (deepspeed_trainer:228) INFO: 21epoch:train:10901-11000batch: iter_time=1.085e-04, loss_ctc=68.310, loss_att=52.646, acc=0.721, loss=57.356, grad_norm=4.424, loss_scale=1.000, learning_rate=8.785e-05, step_time=0.381 [cnode7-012:0/16] 2024-12-07 07:46:41,151 (deepspeed_trainer:228) INFO: 21epoch:train:11001-11100batch: iter_time=1.103e-04, loss_ctc=71.496, loss_att=54.687, acc=0.695, loss=59.734, grad_norm=5.307, loss_scale=1.000, learning_rate=8.784e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 07:47:18,728 (deepspeed_trainer:228) INFO: 21epoch:train:11101-11200batch: iter_time=1.063e-04, loss_ctc=82.310, loss_att=62.258, acc=0.703, loss=68.239, grad_norm=4.604, loss_scale=1.000, learning_rate=8.783e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 07:47:41,526 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-07 07:48:08,788 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 07:48:24,410 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 07:48:24,410 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 07:48:24,412 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 07:48:48,471] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:48:48,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:48:49,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:48:46,929] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:48:48,842] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:48:51,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:48:48,990] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:48:49,091] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:48:49,127] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:48:49,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:48:49,662] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:48:49,754] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:48:53,661] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:48:54,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:48:55,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:48:59,390] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:37,087] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:37,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:35,234] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:36,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:37,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:37,827] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:40,617] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:41,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:38,652] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:38,684] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:38,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:38,747] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:43,496] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:44,153] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:44,712] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:49:49,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:25,129] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:23,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:26,485] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:24,812] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:25,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:25,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:26,526] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:26,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:30,227] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:27,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:27,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:32,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:33,415] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:33,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:34,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:50:38,508] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:12,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:13,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:11,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:13,366] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:13,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:13,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:14,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:14,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:14,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:18,174] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:21,028] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:18,740] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:21,720] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:22,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:23,882] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 07:51:26,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 07:52:11,386 (deepspeed_trainer:228) INFO: 21epoch:train:11201-11300batch: iter_time=2.471, loss_ctc=75.691, loss_att=66.065, acc=0.688, loss=68.947, grad_norm=4.687, loss_scale=1.000, learning_rate=8.781e-05, step_time=0.455 [cnode7-012:0/16] 2024-12-07 07:52:49,646 (deepspeed_trainer:228) INFO: 21epoch:train:11301-11400batch: iter_time=1.130e-04, loss_ctc=65.092, loss_att=55.944, acc=0.693, loss=58.706, grad_norm=4.379, loss_scale=1.000, learning_rate=8.780e-05, step_time=0.383 [cnode7-012:0/16] 2024-12-07 07:53:27,156 (deepspeed_trainer:228) INFO: 21epoch:train:11401-11500batch: iter_time=1.090e-04, loss_ctc=74.982, loss_att=57.637, acc=0.710, loss=62.861, grad_norm=4.747, loss_scale=1.000, learning_rate=8.778e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 07:54:04,464 (deepspeed_trainer:228) INFO: 21epoch:train:11501-11600batch: iter_time=1.124e-04, loss_ctc=69.871, loss_att=52.962, acc=0.698, loss=58.036, grad_norm=4.772, loss_scale=1.000, learning_rate=8.777e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 07:54:41,705 (deepspeed_trainer:228) INFO: 21epoch:train:11601-11700batch: iter_time=1.125e-04, loss_ctc=65.589, loss_att=48.733, acc=0.711, loss=53.780, grad_norm=4.456, loss_scale=1.000, learning_rate=8.775e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 07:55:19,387 (deepspeed_trainer:228) INFO: 21epoch:train:11701-11800batch: iter_time=1.126e-04, loss_ctc=72.963, loss_att=57.541, acc=0.718, loss=62.185, grad_norm=4.695, loss_scale=1.000, learning_rate=8.774e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 07:55:57,126 (deepspeed_trainer:228) INFO: 21epoch:train:11801-11900batch: iter_time=1.150e-04, loss_ctc=80.917, loss_att=63.190, acc=0.704, loss=68.493, grad_norm=5.040, loss_scale=1.000, learning_rate=8.773e-05, step_time=0.377 [2024-12-07 07:56:34,572] [INFO] [logging.py:129:log_dist] [Rank 0] step=312000, skipped=0, lr=[np.float64(8.770566137687104e-05)], mom=[[0.9, 0.98]] [2024-12-07 07:56:34,572] [INFO] [timer.py:264:stop] epoch=0/micro_step=57000/global_step=57000, RunningAvgSamplesPerSec=43.846767382471526, CurrSamplesPerSec=46.33578238227855, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 07:56:34,574 (deepspeed_trainer:228) INFO: 21epoch:train:11901-12000batch: iter_time=1.135e-04, loss_ctc=74.806, loss_att=57.796, acc=0.701, loss=62.927, grad_norm=4.861, loss_scale=1.000, learning_rate=8.771e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 07:57:12,167 (deepspeed_trainer:228) INFO: 21epoch:train:12001-12100batch: iter_time=1.173e-04, loss_ctc=76.178, loss_att=57.408, acc=0.708, loss=63.051, grad_norm=5.133, loss_scale=1.000, learning_rate=8.770e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 07:57:49,680 (deepspeed_trainer:228) INFO: 21epoch:train:12101-12200batch: iter_time=1.144e-04, loss_ctc=71.984, loss_att=55.627, acc=0.708, loss=60.489, grad_norm=4.094, loss_scale=1.000, learning_rate=8.768e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 07:58:26,774 (deepspeed_trainer:228) INFO: 21epoch:train:12201-12300batch: iter_time=1.133e-04, loss_ctc=83.810, loss_att=55.878, acc=0.703, loss=64.268, grad_norm=5.348, loss_scale=1.000, learning_rate=8.767e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 07:59:03,872 (deepspeed_trainer:228) INFO: 21epoch:train:12301-12400batch: iter_time=1.124e-04, loss_ctc=69.935, loss_att=55.150, acc=0.707, loss=59.598, grad_norm=4.434, loss_scale=1.000, learning_rate=8.766e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 07:59:40,810 (deepspeed_trainer:228) INFO: 21epoch:train:12401-12500batch: iter_time=1.104e-04, loss_ctc=79.218, loss_att=59.398, acc=0.701, loss=65.368, grad_norm=5.274, loss_scale=1.000, learning_rate=8.764e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 08:00:17,896 (deepspeed_trainer:228) INFO: 21epoch:train:12501-12600batch: iter_time=1.117e-04, loss_ctc=72.203, loss_att=55.992, acc=0.720, loss=60.838, grad_norm=4.945, loss_scale=1.000, learning_rate=8.763e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 08:00:55,190 (deepspeed_trainer:228) INFO: 21epoch:train:12601-12700batch: iter_time=1.139e-04, loss_ctc=77.573, loss_att=58.916, acc=0.705, loss=64.532, grad_norm=4.954, loss_scale=1.000, learning_rate=8.761e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 08:01:32,306 (deepspeed_trainer:228) INFO: 21epoch:train:12701-12800batch: iter_time=1.122e-04, loss_ctc=64.592, loss_att=48.554, acc=0.718, loss=53.377, grad_norm=4.278, loss_scale=1.000, learning_rate=8.760e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 08:02:09,976 (deepspeed_trainer:228) INFO: 21epoch:train:12801-12900batch: iter_time=1.120e-04, loss_ctc=72.260, loss_att=57.331, acc=0.714, loss=61.837, grad_norm=4.575, loss_scale=1.000, learning_rate=8.759e-05, step_time=0.376 [2024-12-07 08:02:47,286] [INFO] [logging.py:129:log_dist] [Rank 0] step=313000, skipped=0, lr=[np.float64(8.75654448508434e-05)], mom=[[0.9, 0.98]] [2024-12-07 08:02:47,286] [INFO] [timer.py:264:stop] epoch=0/micro_step=58000/global_step=58000, RunningAvgSamplesPerSec=43.849594169026894, CurrSamplesPerSec=41.014292010577364, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 08:02:47,288 (deepspeed_trainer:228) INFO: 21epoch:train:12901-13000batch: iter_time=1.112e-04, loss_ctc=76.311, loss_att=60.467, acc=0.700, loss=65.198, grad_norm=5.103, loss_scale=1.000, learning_rate=8.757e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 08:03:24,616 (deepspeed_trainer:228) INFO: 21epoch:train:13001-13100batch: iter_time=1.109e-04, loss_ctc=73.888, loss_att=55.871, acc=0.709, loss=61.298, grad_norm=4.813, loss_scale=1.000, learning_rate=8.756e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 08:03:38,081 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-07 08:04:05,001 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 08:04:21,388 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 08:04:21,388 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 08:04:21,390 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 08:04:46,959] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:04:47,452] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:04:45,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:04:48,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:04:45,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:04:46,116] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:04:46,721] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:04:46,807] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:04:46,873] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:04:46,908] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:04:49,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:04:50,064] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:04:47,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:04:51,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:04:51,474] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:04:52,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:35,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:36,416] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:33,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:37,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:35,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:35,287] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:36,163] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:36,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:36,611] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:36,657] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:36,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:40,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:41,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:41,413] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:42,303] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:05:43,101] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:23,740] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:25,007] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:22,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:24,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:24,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:24,701] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:24,922] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:25,091] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:28,401] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:25,695] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:28,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:29,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:30,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:31,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:29,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:06:32,707] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:11,246] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:12,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:09,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:12,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:12,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:16,034] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:13,565] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:13,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:14,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:14,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:17,950] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:18,829] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:19,372] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:19,883] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:21,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:07:19,494] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 08:08:16,474 (deepspeed_trainer:228) INFO: 21epoch:train:13101-13200batch: iter_time=2.483, loss_ctc=81.873, loss_att=70.710, acc=0.691, loss=74.069, grad_norm=4.905, loss_scale=1.000, learning_rate=8.754e-05, step_time=0.435 [cnode7-012:0/16] 2024-12-07 08:08:53,288 (deepspeed_trainer:228) INFO: 21epoch:train:13201-13300batch: iter_time=1.050e-04, loss_ctc=62.624, loss_att=51.000, acc=0.696, loss=54.454, grad_norm=5.014, loss_scale=1.000, learning_rate=8.753e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 08:09:30,423 (deepspeed_trainer:228) INFO: 21epoch:train:13301-13400batch: iter_time=1.045e-04, loss_ctc=74.018, loss_att=53.701, acc=0.713, loss=59.787, grad_norm=4.382, loss_scale=1.000, learning_rate=8.752e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 08:10:07,581 (deepspeed_trainer:228) INFO: 21epoch:train:13401-13500batch: iter_time=1.072e-04, loss_ctc=71.077, loss_att=57.648, acc=0.696, loss=61.688, grad_norm=5.193, loss_scale=1.000, learning_rate=8.750e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 08:10:44,532 (deepspeed_trainer:228) INFO: 21epoch:train:13501-13600batch: iter_time=1.065e-04, loss_ctc=66.549, loss_att=47.789, acc=0.712, loss=53.378, grad_norm=4.649, loss_scale=1.000, learning_rate=8.749e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 08:11:21,980 (deepspeed_trainer:228) INFO: 21epoch:train:13601-13700batch: iter_time=1.054e-04, loss_ctc=71.857, loss_att=55.340, acc=0.722, loss=60.313, grad_norm=5.205, loss_scale=1.000, learning_rate=8.747e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 08:11:59,667 (deepspeed_trainer:228) INFO: 21epoch:train:13701-13800batch: iter_time=1.068e-04, loss_ctc=81.928, loss_att=61.405, acc=0.705, loss=67.549, grad_norm=5.393, loss_scale=1.000, learning_rate=8.746e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 08:12:37,089 (deepspeed_trainer:228) INFO: 21epoch:train:13801-13900batch: iter_time=1.085e-04, loss_ctc=69.176, loss_att=53.880, acc=0.710, loss=58.491, grad_norm=4.523, loss_scale=1.000, learning_rate=8.745e-05, step_time=0.374 [2024-12-07 08:13:14,662] [INFO] [logging.py:129:log_dist] [Rank 0] step=314000, skipped=0, lr=[np.float64(8.742589868128359e-05)], mom=[[0.9, 0.98]] [2024-12-07 08:13:14,662] [INFO] [timer.py:264:stop] epoch=0/micro_step=59000/global_step=59000, RunningAvgSamplesPerSec=43.83971174599458, CurrSamplesPerSec=41.29226882872615, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 08:13:14,664 (deepspeed_trainer:228) INFO: 21epoch:train:13901-14000batch: iter_time=1.075e-04, loss_ctc=82.419, loss_att=65.050, acc=0.701, loss=70.256, grad_norm=5.384, loss_scale=1.000, learning_rate=8.743e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 08:13:52,115 (deepspeed_trainer:228) INFO: 21epoch:train:14001-14100batch: iter_time=1.084e-04, loss_ctc=67.672, loss_att=50.015, acc=0.714, loss=55.314, grad_norm=4.506, loss_scale=1.000, learning_rate=8.742e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 08:14:29,568 (deepspeed_trainer:228) INFO: 21epoch:train:14101-14200batch: iter_time=1.113e-04, loss_ctc=84.039, loss_att=58.346, acc=0.696, loss=66.065, grad_norm=5.781, loss_scale=1.000, learning_rate=8.740e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 08:15:07,435 (deepspeed_trainer:228) INFO: 21epoch:train:14201-14300batch: iter_time=1.088e-04, loss_ctc=75.034, loss_att=57.558, acc=0.711, loss=62.814, grad_norm=5.055, loss_scale=1.000, learning_rate=8.739e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 08:15:44,526 (deepspeed_trainer:228) INFO: 21epoch:train:14301-14400batch: iter_time=1.078e-04, loss_ctc=71.897, loss_att=52.781, acc=0.708, loss=58.508, grad_norm=5.117, loss_scale=1.000, learning_rate=8.738e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 08:16:21,542 (deepspeed_trainer:228) INFO: 21epoch:train:14401-14500batch: iter_time=1.098e-04, loss_ctc=75.230, loss_att=59.345, acc=0.710, loss=64.135, grad_norm=5.500, loss_scale=1.000, learning_rate=8.736e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 08:16:58,419 (deepspeed_trainer:228) INFO: 21epoch:train:14501-14600batch: iter_time=1.081e-04, loss_ctc=77.255, loss_att=57.070, acc=0.705, loss=63.133, grad_norm=4.722, loss_scale=1.000, learning_rate=8.735e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 08:17:35,225 (deepspeed_trainer:228) INFO: 21epoch:train:14601-14700batch: iter_time=1.072e-04, loss_ctc=66.484, loss_att=49.362, acc=0.727, loss=54.515, grad_norm=3.919, loss_scale=1.000, learning_rate=8.734e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 08:18:12,149 (deepspeed_trainer:228) INFO: 21epoch:train:14701-14800batch: iter_time=1.089e-04, loss_ctc=67.882, loss_att=56.410, acc=0.704, loss=59.864, grad_norm=5.187, loss_scale=1.000, learning_rate=8.732e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 08:18:49,402 (deepspeed_trainer:228) INFO: 21epoch:train:14801-14900batch: iter_time=1.078e-04, loss_ctc=78.644, loss_att=62.360, acc=0.702, loss=67.261, grad_norm=5.236, loss_scale=1.000, learning_rate=8.731e-05, step_time=0.372 [2024-12-07 08:19:26,284] [INFO] [logging.py:129:log_dist] [Rank 0] step=315000, skipped=0, lr=[np.float64(8.728701754368542e-05)], mom=[[0.9, 0.98]] [2024-12-07 08:19:26,284] [INFO] [timer.py:264:stop] epoch=0/micro_step=60000/global_step=60000, RunningAvgSamplesPerSec=43.84507261735329, CurrSamplesPerSec=43.98999930029549, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 08:19:26,286 (deepspeed_trainer:228) INFO: 21epoch:train:14901-15000batch: iter_time=1.081e-04, loss_ctc=77.899, loss_att=58.939, acc=0.706, loss=64.626, grad_norm=4.847, loss_scale=1.000, learning_rate=8.729e-05, step_time=0.368 [2024-12-07 08:19:41,783] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:41,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:42,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:39,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:39,654] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:39,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:39,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:42,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:42,959] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:40,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:40,177] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:43,058] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:43,086] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:43,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:40,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:40,288] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:57,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:55,484] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:58,388] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:56,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:56,806] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:59,947] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:57,152] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:00,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:57,230] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:57,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:57,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:19:57,695] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:01,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:01,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:01,680] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:01,770] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:13,957] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:11,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:14,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:11,807] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:12,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:12,534] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:15,676] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:12,987] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:15,957] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:13,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:13,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:16,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:14,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:17,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:17,521] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:17,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:28,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:26,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:29,536] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:27,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:28,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:28,632] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:31,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:31,770] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:29,118] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:32,037] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:29,153] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:29,419] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:32,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:32,907] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:30,131] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:33,303] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:20:46,340] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 21 is about to be saved! [2024-12-07 08:20:46,375] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/mp_rank_00_model_states.pt [2024-12-07 08:20:46,375] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/mp_rank_00_model_states.pt... [2024-12-07 08:20:48,400] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/mp_rank_00_model_states.pt. [2024-12-07 08:20:45,665] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-07 08:20:48,559] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-07 08:20:45,669] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-07 08:20:48,562] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-07 08:20:48,562] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-07 08:20:45,671] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-07 08:20:48,563] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-07 08:20:48,563] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-07 08:20:45,671] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-07 08:20:48,563] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-07 08:20:48,564] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-07 08:20:48,565] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-07 08:20:45,672] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-07 08:20:45,673] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-07 08:20:45,674] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-07 08:20:45,674] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-07 08:20:46,439] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-07 08:20:46,440] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-07 08:20:46,440] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [2024-12-07 08:20:49,339] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-07 08:20:49,339] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-07 08:20:49,339] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [2024-12-07 08:20:46,455] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-07 08:20:46,455] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-07 08:20:46,455] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [2024-12-07 08:20:49,355] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-07 08:20:49,355] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-07 08:20:49,355] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [2024-12-07 08:20:46,467] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-07 08:20:46,467] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-07 08:20:46,467] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [2024-12-07 08:20:49,364] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-07 08:20:49,367] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-07 08:20:49,368] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-07 08:20:49,368] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [2024-12-07 08:20:46,476] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-07 08:20:46,476] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-07 08:20:46,476] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [2024-12-07 08:20:49,371] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-07 08:20:49,372] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-07 08:20:49,372] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [2024-12-07 08:20:46,486] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-07 08:20:46,487] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-07 08:20:46,487] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [2024-12-07 08:20:49,379] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-07 08:20:49,379] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-07 08:20:49,379] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [2024-12-07 08:20:46,501] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-07 08:20:46,501] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-07 08:20:46,501] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [2024-12-07 08:20:49,419] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-07 08:20:49,419] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-07 08:20:49,419] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [2024-12-07 08:20:46,527] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-07 08:20:46,527] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-07 08:20:46,527] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [2024-12-07 08:20:49,420] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-07 08:20:49,421] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [2024-12-07 08:20:46,529] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-07 08:20:46,529] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-07 08:20:46,529] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [2024-12-07 08:20:49,443] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-07 08:20:49,444] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_21/21/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-07 08:20:49,444] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 21 is ready now! [cnode7-012:0/16] 2024-12-07 08:20:49,455 (deepspeed_trainer:158) INFO: 21epoch results: [train] iter_time=0.133, loss_ctc=74.720, loss_att=56.968, acc=0.702, loss=62.297, grad_norm=4.826, loss_scale=1.000, learning_rate=8.835e-05, step_time=0.375, time=2 hours, 7 minutes and 8.94 seconds, total_count=315021, gpu_max_cached_mem_GB=27.986, [valid] loss_ctc=4.844, cer_ctc=0.134, loss_att=8.375, acc=0.789, cer=0.425, wer=1.000, loss=7.312, time=1 minute and 10.74 seconds, total_count=21, gpu_max_cached_mem_GB=27.986 [cnode7-012:0/16] 2024-12-07 08:20:51,369 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-07 08:21:18,362 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 08:21:34,271 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 08:21:34,271 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 08:21:34,273 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 08:21:52,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:21:52,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:21:52,891] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:21:53,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:21:56,228] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:21:53,349] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:21:53,556] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:21:56,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:21:54,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:21:57,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:21:57,556] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:21:54,652] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:21:57,973] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:21:58,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:21:58,831] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:21:59,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:40,100] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:40,658] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:40,913] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:41,133] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:41,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:41,736] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:42,388] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:45,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:45,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:43,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:47,785] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:48,221] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:48,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:48,885] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:49,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:22:49,713] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:28,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:28,816] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:29,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:29,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:29,290] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:29,752] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:33,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:30,599] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:33,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:32,021] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:36,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:36,707] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:36,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:37,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:39,204] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:23:40,469] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:15,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:16,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:19,934] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:17,130] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:17,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:17,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:20,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:18,045] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:18,587] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:21,803] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:25,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:25,390] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:25,617] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:25,860] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:26,161] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:24:29,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 08:25:32,298 (deepspeed_trainer:228) INFO: 22epoch:train:1-100batch: iter_time=2.414, loss_ctc=69.546, loss_att=52.051, acc=0.702, loss=57.321, grad_norm=4.952, loss_scale=1.000, learning_rate=8.728e-05, step_time=0.395 [cnode7-012:0/16] 2024-12-07 08:26:09,344 (deepspeed_trainer:228) INFO: 22epoch:train:101-200batch: iter_time=1.070e-04, loss_ctc=76.087, loss_att=59.378, acc=0.699, loss=64.403, grad_norm=4.408, loss_scale=1.000, learning_rate=8.727e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 08:26:46,479 (deepspeed_trainer:228) INFO: 22epoch:train:201-300batch: iter_time=1.064e-04, loss_ctc=80.936, loss_att=59.985, acc=0.695, loss=66.288, grad_norm=4.726, loss_scale=1.000, learning_rate=8.725e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 08:27:23,511 (deepspeed_trainer:228) INFO: 22epoch:train:301-400batch: iter_time=1.064e-04, loss_ctc=80.708, loss_att=62.901, acc=0.681, loss=68.251, grad_norm=5.197, loss_scale=1.000, learning_rate=8.724e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 08:28:00,488 (deepspeed_trainer:228) INFO: 22epoch:train:401-500batch: iter_time=1.051e-04, loss_ctc=72.131, loss_att=57.385, acc=0.696, loss=61.822, grad_norm=4.814, loss_scale=1.000, learning_rate=8.722e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 08:28:37,623 (deepspeed_trainer:228) INFO: 22epoch:train:501-600batch: iter_time=1.097e-04, loss_ctc=76.402, loss_att=58.279, acc=0.688, loss=63.722, grad_norm=4.891, loss_scale=1.000, learning_rate=8.721e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 08:29:14,653 (deepspeed_trainer:228) INFO: 22epoch:train:601-700batch: iter_time=1.105e-04, loss_ctc=72.130, loss_att=53.439, acc=0.709, loss=59.056, grad_norm=4.817, loss_scale=1.000, learning_rate=8.720e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 08:29:51,493 (deepspeed_trainer:228) INFO: 22epoch:train:701-800batch: iter_time=1.064e-04, loss_ctc=66.940, loss_att=53.802, acc=0.684, loss=57.739, grad_norm=4.742, loss_scale=1.000, learning_rate=8.718e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 08:30:28,727 (deepspeed_trainer:228) INFO: 22epoch:train:801-900batch: iter_time=1.094e-04, loss_ctc=84.853, loss_att=64.731, acc=0.679, loss=70.793, grad_norm=5.857, loss_scale=1.000, learning_rate=8.717e-05, step_time=0.372 [2024-12-07 08:31:06,027] [INFO] [logging.py:129:log_dist] [Rank 0] step=316000, skipped=0, lr=[np.float64(8.714879617256328e-05)], mom=[[0.9, 0.98]] [2024-12-07 08:31:06,028] [INFO] [timer.py:264:stop] epoch=0/micro_step=61000/global_step=61000, RunningAvgSamplesPerSec=43.84664342510328, CurrSamplesPerSec=42.48417704584297, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 08:31:06,030 (deepspeed_trainer:228) INFO: 22epoch:train:901-1000batch: iter_time=1.095e-04, loss_ctc=82.894, loss_att=55.513, acc=0.718, loss=63.731, grad_norm=4.749, loss_scale=1.000, learning_rate=8.716e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 08:31:43,148 (deepspeed_trainer:228) INFO: 22epoch:train:1001-1100batch: iter_time=1.092e-04, loss_ctc=67.384, loss_att=52.491, acc=0.684, loss=56.951, grad_norm=4.250, loss_scale=1.000, learning_rate=8.714e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 08:32:21,558 (deepspeed_trainer:228) INFO: 22epoch:train:1101-1200batch: iter_time=1.164e-04, loss_ctc=71.485, loss_att=53.830, acc=0.693, loss=59.106, grad_norm=4.847, loss_scale=1.000, learning_rate=8.713e-05, step_time=0.383 [cnode7-012:0/16] 2024-12-07 08:32:58,473 (deepspeed_trainer:228) INFO: 22epoch:train:1201-1300batch: iter_time=1.074e-04, loss_ctc=78.449, loss_att=57.069, acc=0.710, loss=63.483, grad_norm=4.675, loss_scale=1.000, learning_rate=8.711e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 08:33:35,795 (deepspeed_trainer:228) INFO: 22epoch:train:1301-1400batch: iter_time=1.075e-04, loss_ctc=75.056, loss_att=52.859, acc=0.719, loss=59.524, grad_norm=4.719, loss_scale=1.000, learning_rate=8.710e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 08:34:12,744 (deepspeed_trainer:228) INFO: 22epoch:train:1401-1500batch: iter_time=1.079e-04, loss_ctc=66.278, loss_att=55.261, acc=0.696, loss=58.532, grad_norm=4.833, loss_scale=1.000, learning_rate=8.709e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 08:34:49,671 (deepspeed_trainer:228) INFO: 22epoch:train:1501-1600batch: iter_time=1.097e-04, loss_ctc=73.528, loss_att=52.503, acc=0.711, loss=58.806, grad_norm=4.384, loss_scale=1.000, learning_rate=8.707e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 08:35:26,716 (deepspeed_trainer:228) INFO: 22epoch:train:1601-1700batch: iter_time=1.067e-04, loss_ctc=81.041, loss_att=63.993, acc=0.698, loss=69.123, grad_norm=4.856, loss_scale=1.000, learning_rate=8.706e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 08:36:04,016 (deepspeed_trainer:228) INFO: 22epoch:train:1701-1800batch: iter_time=1.104e-04, loss_ctc=81.882, loss_att=60.608, acc=0.698, loss=66.996, grad_norm=4.873, loss_scale=1.000, learning_rate=8.705e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 08:36:37,221 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-07 08:37:03,141 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 08:37:20,942 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 08:37:20,942 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 08:37:20,944 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 08:37:42,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:37:42,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:37:43,348] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:37:43,395] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:37:44,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:37:44,713] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:37:45,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:37:45,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:37:50,365] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:37:50,573] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:37:50,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:37:50,895] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:37:50,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:37:51,015] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:37:51,797] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:37:52,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:31,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:32,082] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:32,359] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:32,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:33,502] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:33,515] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:34,190] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:34,229] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:42,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:42,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:43,229] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:43,569] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:44,029] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:44,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:44,257] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:38:45,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:20,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:20,599] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:21,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:21,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:21,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:21,786] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:22,128] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:23,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:30,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:31,574] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:33,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:33,838] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:33,885] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:34,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:34,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:39:36,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:08,488] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:08,805] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:09,728] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:09,855] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:10,237] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:10,794] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:10,845] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:11,221] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:18,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:19,127] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:21,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:21,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:22,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:22,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:24,791] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:40:26,369] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 08:41:03,019 (deepspeed_trainer:228) INFO: 22epoch:train:1801-1900batch: iter_time=2.585, loss_ctc=78.489, loss_att=62.214, acc=0.696, loss=67.087, grad_norm=5.470, loss_scale=1.000, learning_rate=8.703e-05, step_time=0.405 [2024-12-07 08:41:40,223] [INFO] [logging.py:129:log_dist] [Rank 0] step=317000, skipped=0, lr=[np.float64(8.701122936061366e-05)], mom=[[0.9, 0.98]] [2024-12-07 08:41:40,225] [INFO] [timer.py:264:stop] epoch=0/micro_step=62000/global_step=62000, RunningAvgSamplesPerSec=43.843865468505925, CurrSamplesPerSec=43.7017572877057, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 08:41:40,227 (deepspeed_trainer:228) INFO: 22epoch:train:1901-2000batch: iter_time=1.152e-04, loss_ctc=69.149, loss_att=49.014, acc=0.718, loss=55.036, grad_norm=5.368, loss_scale=1.000, learning_rate=8.702e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 08:42:17,899 (deepspeed_trainer:228) INFO: 22epoch:train:2001-2100batch: iter_time=1.073e-04, loss_ctc=72.837, loss_att=59.893, acc=0.711, loss=63.782, grad_norm=4.279, loss_scale=1.000, learning_rate=8.700e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 08:42:55,354 (deepspeed_trainer:228) INFO: 22epoch:train:2101-2200batch: iter_time=1.109e-04, loss_ctc=85.318, loss_att=64.007, acc=0.697, loss=70.390, grad_norm=4.750, loss_scale=1.000, learning_rate=8.699e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 08:43:32,566 (deepspeed_trainer:228) INFO: 22epoch:train:2201-2300batch: iter_time=1.089e-04, loss_ctc=73.835, loss_att=56.446, acc=0.694, loss=61.654, grad_norm=5.056, loss_scale=1.000, learning_rate=8.698e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 08:44:10,053 (deepspeed_trainer:228) INFO: 22epoch:train:2301-2400batch: iter_time=1.091e-04, loss_ctc=74.789, loss_att=58.435, acc=0.702, loss=63.337, grad_norm=4.619, loss_scale=1.000, learning_rate=8.696e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 08:44:47,669 (deepspeed_trainer:228) INFO: 22epoch:train:2401-2500batch: iter_time=1.101e-04, loss_ctc=78.359, loss_att=62.018, acc=0.697, loss=66.884, grad_norm=4.487, loss_scale=1.000, learning_rate=8.695e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 08:45:24,794 (deepspeed_trainer:228) INFO: 22epoch:train:2501-2600batch: iter_time=1.111e-04, loss_ctc=67.865, loss_att=50.057, acc=0.719, loss=55.409, grad_norm=4.253, loss_scale=1.000, learning_rate=8.694e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 08:46:01,711 (deepspeed_trainer:228) INFO: 22epoch:train:2601-2700batch: iter_time=1.107e-04, loss_ctc=71.682, loss_att=56.092, acc=0.682, loss=60.768, grad_norm=5.275, loss_scale=1.000, learning_rate=8.692e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 08:46:38,920 (deepspeed_trainer:228) INFO: 22epoch:train:2701-2800batch: iter_time=1.084e-04, loss_ctc=77.350, loss_att=59.784, acc=0.705, loss=65.081, grad_norm=5.072, loss_scale=1.000, learning_rate=8.691e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 08:47:16,122 (deepspeed_trainer:228) INFO: 22epoch:train:2801-2900batch: iter_time=1.115e-04, loss_ctc=82.709, loss_att=60.129, acc=0.702, loss=66.887, grad_norm=4.689, loss_scale=1.000, learning_rate=8.689e-05, step_time=0.372 [2024-12-07 08:47:52,870] [INFO] [logging.py:129:log_dist] [Rank 0] step=318000, skipped=0, lr=[np.float64(8.687431195789114e-05)], mom=[[0.9, 0.98]] [2024-12-07 08:47:52,871] [INFO] [timer.py:264:stop] epoch=0/micro_step=63000/global_step=63000, RunningAvgSamplesPerSec=43.84728065339482, CurrSamplesPerSec=46.144172941493764, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 08:47:52,872 (deepspeed_trainer:228) INFO: 22epoch:train:2901-3000batch: iter_time=1.074e-04, loss_ctc=66.132, loss_att=53.949, acc=0.696, loss=57.605, grad_norm=3.949, loss_scale=1.000, learning_rate=8.688e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 08:48:29,710 (deepspeed_trainer:228) INFO: 22epoch:train:3001-3100batch: iter_time=1.100e-04, loss_ctc=73.696, loss_att=53.553, acc=0.708, loss=59.593, grad_norm=4.566, loss_scale=1.000, learning_rate=8.687e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 08:49:06,826 (deepspeed_trainer:228) INFO: 22epoch:train:3101-3200batch: iter_time=1.105e-04, loss_ctc=78.388, loss_att=55.334, acc=0.715, loss=62.253, grad_norm=4.649, loss_scale=1.000, learning_rate=8.685e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 08:49:43,869 (deepspeed_trainer:228) INFO: 22epoch:train:3201-3300batch: iter_time=1.083e-04, loss_ctc=69.638, loss_att=52.930, acc=0.718, loss=57.924, grad_norm=4.172, loss_scale=1.000, learning_rate=8.684e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 08:50:20,826 (deepspeed_trainer:228) INFO: 22epoch:train:3301-3400batch: iter_time=1.092e-04, loss_ctc=65.687, loss_att=52.654, acc=0.706, loss=56.554, grad_norm=4.231, loss_scale=1.000, learning_rate=8.683e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 08:50:57,682 (deepspeed_trainer:228) INFO: 22epoch:train:3401-3500batch: iter_time=1.087e-04, loss_ctc=70.453, loss_att=52.643, acc=0.708, loss=57.964, grad_norm=4.063, loss_scale=1.000, learning_rate=8.681e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 08:51:35,216 (deepspeed_trainer:228) INFO: 22epoch:train:3501-3600batch: iter_time=1.096e-04, loss_ctc=91.340, loss_att=70.662, acc=0.714, loss=76.862, grad_norm=4.529, loss_scale=1.000, learning_rate=8.680e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 08:52:12,217 (deepspeed_trainer:228) INFO: 22epoch:train:3601-3700batch: iter_time=1.081e-04, loss_ctc=75.718, loss_att=57.983, acc=0.703, loss=63.299, grad_norm=4.905, loss_scale=1.000, learning_rate=8.679e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 08:52:35,638 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-07 08:53:02,304 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 08:53:19,860 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 08:53:19,860 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 08:53:19,863 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 08:53:40,736] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:53:44,089] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:53:44,143] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:53:41,203] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:53:44,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:53:41,427] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:53:41,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:53:42,365] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:53:46,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:53:43,265] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:53:43,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:53:43,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:53:48,248] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:53:48,605] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:53:48,703] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:53:49,021] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:29,387] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:32,805] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:29,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:30,214] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:33,418] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:33,460] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:30,511] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:30,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:30,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:35,751] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:36,999] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:34,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:38,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:35,584] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:38,852] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:54:40,287] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:19,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:20,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:17,372] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:17,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:20,971] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:17,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:18,810] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:19,119] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:19,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:24,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:26,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:26,645] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:27,386] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:25,799] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:26,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:55:30,014] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:04,791] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:05,204] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:08,443] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:05,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:09,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:06,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:09,852] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:07,790] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:10,736] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:14,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:15,118] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:14,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:17,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:18,720] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:15,771] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 08:56:23,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 08:57:09,549 (deepspeed_trainer:228) INFO: 22epoch:train:3701-3800batch: iter_time=2.598, loss_ctc=79.389, loss_att=64.081, acc=0.697, loss=68.661, grad_norm=5.390, loss_scale=1.000, learning_rate=8.677e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 08:57:46,962 (deepspeed_trainer:228) INFO: 22epoch:train:3801-3900batch: iter_time=1.563e-04, loss_ctc=66.974, loss_att=46.306, acc=0.725, loss=52.516, grad_norm=4.966, loss_scale=1.000, learning_rate=8.676e-05, step_time=0.374 [2024-12-07 08:58:24,924] [INFO] [logging.py:129:log_dist] [Rank 0] step=319000, skipped=0, lr=[np.float64(8.673803887099866e-05)], mom=[[0.9, 0.98]] [2024-12-07 08:58:24,925] [INFO] [timer.py:264:stop] epoch=0/micro_step=64000/global_step=64000, RunningAvgSamplesPerSec=43.85172274488181, CurrSamplesPerSec=43.92084522792244, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 08:58:24,927 (deepspeed_trainer:228) INFO: 22epoch:train:3901-4000batch: iter_time=1.457e-04, loss_ctc=75.559, loss_att=58.428, acc=0.713, loss=63.571, grad_norm=4.227, loss_scale=1.000, learning_rate=8.674e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 08:59:02,339 (deepspeed_trainer:228) INFO: 22epoch:train:4001-4100batch: iter_time=1.304e-04, loss_ctc=77.251, loss_att=60.748, acc=0.699, loss=65.696, grad_norm=4.367, loss_scale=1.000, learning_rate=8.673e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 08:59:39,646 (deepspeed_trainer:228) INFO: 22epoch:train:4101-4200batch: iter_time=1.233e-04, loss_ctc=78.363, loss_att=58.483, acc=0.695, loss=64.422, grad_norm=4.680, loss_scale=1.000, learning_rate=8.672e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 09:00:17,107 (deepspeed_trainer:228) INFO: 22epoch:train:4201-4300batch: iter_time=1.419e-04, loss_ctc=71.680, loss_att=53.648, acc=0.707, loss=59.050, grad_norm=4.412, loss_scale=1.000, learning_rate=8.670e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 09:00:55,025 (deepspeed_trainer:228) INFO: 22epoch:train:4301-4400batch: iter_time=1.214e-04, loss_ctc=79.612, loss_att=63.820, acc=0.699, loss=68.568, grad_norm=5.088, loss_scale=1.000, learning_rate=8.669e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 09:01:32,100 (deepspeed_trainer:228) INFO: 22epoch:train:4401-4500batch: iter_time=1.141e-04, loss_ctc=65.445, loss_att=48.628, acc=0.711, loss=53.673, grad_norm=4.332, loss_scale=1.000, learning_rate=8.668e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 09:02:09,297 (deepspeed_trainer:228) INFO: 22epoch:train:4501-4600batch: iter_time=1.145e-04, loss_ctc=71.378, loss_att=55.289, acc=0.694, loss=60.126, grad_norm=4.998, loss_scale=1.000, learning_rate=8.666e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 09:02:46,928 (deepspeed_trainer:228) INFO: 22epoch:train:4601-4700batch: iter_time=1.171e-04, loss_ctc=77.461, loss_att=57.970, acc=0.707, loss=63.825, grad_norm=5.304, loss_scale=1.000, learning_rate=8.665e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 09:03:24,609 (deepspeed_trainer:228) INFO: 22epoch:train:4701-4800batch: iter_time=1.177e-04, loss_ctc=79.733, loss_att=59.128, acc=0.698, loss=65.299, grad_norm=5.446, loss_scale=1.000, learning_rate=8.664e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 09:04:02,097 (deepspeed_trainer:228) INFO: 22epoch:train:4801-4900batch: iter_time=1.162e-04, loss_ctc=71.676, loss_att=56.261, acc=0.704, loss=60.899, grad_norm=5.002, loss_scale=1.000, learning_rate=8.662e-05, step_time=0.374 [2024-12-07 09:04:39,784] [INFO] [logging.py:129:log_dist] [Rank 0] step=320000, skipped=0, lr=[np.float64(8.660240506229167e-05)], mom=[[0.9, 0.98]] [2024-12-07 09:04:39,785] [INFO] [timer.py:264:stop] epoch=0/micro_step=65000/global_step=65000, RunningAvgSamplesPerSec=43.85051606073809, CurrSamplesPerSec=43.72604621047783, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 09:04:39,786 (deepspeed_trainer:228) INFO: 22epoch:train:4901-5000batch: iter_time=1.198e-04, loss_ctc=75.873, loss_att=56.506, acc=0.709, loss=62.338, grad_norm=4.853, loss_scale=1.000, learning_rate=8.661e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 09:05:17,654 (deepspeed_trainer:228) INFO: 22epoch:train:5001-5100batch: iter_time=1.172e-04, loss_ctc=75.595, loss_att=50.919, acc=0.723, loss=58.307, grad_norm=4.976, loss_scale=1.000, learning_rate=8.660e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 09:05:55,045 (deepspeed_trainer:228) INFO: 22epoch:train:5101-5200batch: iter_time=1.139e-04, loss_ctc=65.339, loss_att=54.062, acc=0.711, loss=57.431, grad_norm=4.145, loss_scale=1.000, learning_rate=8.658e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 09:06:32,216 (deepspeed_trainer:228) INFO: 22epoch:train:5201-5300batch: iter_time=1.128e-04, loss_ctc=67.151, loss_att=49.500, acc=0.715, loss=54.784, grad_norm=4.320, loss_scale=1.000, learning_rate=8.657e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 09:07:09,719 (deepspeed_trainer:228) INFO: 22epoch:train:5301-5400batch: iter_time=1.158e-04, loss_ctc=77.159, loss_att=57.498, acc=0.713, loss=63.405, grad_norm=4.677, loss_scale=1.000, learning_rate=8.656e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 09:07:47,398 (deepspeed_trainer:228) INFO: 22epoch:train:5401-5500batch: iter_time=1.109e-04, loss_ctc=88.701, loss_att=68.593, acc=0.705, loss=74.614, grad_norm=4.869, loss_scale=1.000, learning_rate=8.654e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 09:08:25,119 (deepspeed_trainer:228) INFO: 22epoch:train:5501-5600batch: iter_time=1.168e-04, loss_ctc=71.392, loss_att=58.958, acc=0.705, loss=62.685, grad_norm=4.875, loss_scale=1.000, learning_rate=8.653e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 09:08:38,465 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-07 09:09:05,641 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 09:09:23,081 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 09:09:23,081 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 09:09:23,084 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 09:09:43,179] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:09:43,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:09:46,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:09:44,305] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:09:47,856] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:09:47,897] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:09:45,095] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:09:45,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:09:45,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:09:46,012] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:09:46,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:09:51,526] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:09:51,582] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:09:53,106] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:09:53,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:09:53,940] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:31,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:35,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:32,867] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:36,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:33,294] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:34,057] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:34,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:34,245] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:34,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:34,786] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:38,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:40,312] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:41,273] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:41,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:42,380] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:10:43,081] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:22,416] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:19,496] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:20,830] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:24,630] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:21,980] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:22,079] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:22,473] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:22,667] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:22,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:29,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:29,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:29,759] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:27,777] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:31,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:32,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:11:32,385] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:06,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:11,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:09,489] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:10,223] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:10,447] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:11,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:11,743] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:11,797] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:15,129] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:18,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:19,211] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:20,015] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:20,298] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:17,874] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:21,779] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:12:22,149] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 09:13:16,788 (deepspeed_trainer:228) INFO: 22epoch:train:5601-5700batch: iter_time=2.542, loss_ctc=73.254, loss_att=55.727, acc=0.705, loss=60.979, grad_norm=4.914, loss_scale=1.000, learning_rate=8.651e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 09:13:54,082 (deepspeed_trainer:228) INFO: 22epoch:train:5701-5800batch: iter_time=1.049e-04, loss_ctc=69.676, loss_att=52.734, acc=0.710, loss=57.792, grad_norm=4.348, loss_scale=1.000, learning_rate=8.650e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 09:14:31,713 (deepspeed_trainer:228) INFO: 22epoch:train:5801-5900batch: iter_time=1.070e-04, loss_ctc=78.021, loss_att=59.547, acc=0.702, loss=65.068, grad_norm=4.526, loss_scale=1.000, learning_rate=8.649e-05, step_time=0.376 [2024-12-07 09:15:09,003] [INFO] [logging.py:129:log_dist] [Rank 0] step=321000, skipped=0, lr=[np.float64(8.646740554909596e-05)], mom=[[0.9, 0.98]] [2024-12-07 09:15:09,004] [INFO] [timer.py:264:stop] epoch=0/micro_step=66000/global_step=66000, RunningAvgSamplesPerSec=43.84882398204545, CurrSamplesPerSec=43.60515416861215, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 09:15:09,005 (deepspeed_trainer:228) INFO: 22epoch:train:5901-6000batch: iter_time=1.089e-04, loss_ctc=77.429, loss_att=59.237, acc=0.697, loss=64.696, grad_norm=4.445, loss_scale=1.000, learning_rate=8.647e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 09:15:46,303 (deepspeed_trainer:228) INFO: 22epoch:train:6001-6100batch: iter_time=1.099e-04, loss_ctc=74.162, loss_att=57.944, acc=0.695, loss=62.818, grad_norm=4.818, loss_scale=1.000, learning_rate=8.646e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 09:16:23,674 (deepspeed_trainer:228) INFO: 22epoch:train:6101-6200batch: iter_time=1.110e-04, loss_ctc=73.462, loss_att=53.806, acc=0.698, loss=59.700, grad_norm=4.640, loss_scale=1.000, learning_rate=8.645e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 09:17:01,155 (deepspeed_trainer:228) INFO: 22epoch:train:6201-6300batch: iter_time=1.151e-04, loss_ctc=72.412, loss_att=56.613, acc=0.706, loss=61.363, grad_norm=4.434, loss_scale=1.000, learning_rate=8.643e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 09:17:38,682 (deepspeed_trainer:228) INFO: 22epoch:train:6301-6400batch: iter_time=1.145e-04, loss_ctc=71.107, loss_att=57.793, acc=0.685, loss=61.785, grad_norm=4.631, loss_scale=1.000, learning_rate=8.642e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 09:18:16,225 (deepspeed_trainer:228) INFO: 22epoch:train:6401-6500batch: iter_time=1.082e-04, loss_ctc=71.935, loss_att=53.961, acc=0.698, loss=59.363, grad_norm=4.899, loss_scale=1.000, learning_rate=8.641e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 09:18:54,073 (deepspeed_trainer:228) INFO: 22epoch:train:6501-6600batch: iter_time=1.064e-04, loss_ctc=84.066, loss_att=61.639, acc=0.701, loss=68.353, grad_norm=4.542, loss_scale=1.000, learning_rate=8.639e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 09:19:31,166 (deepspeed_trainer:228) INFO: 22epoch:train:6601-6700batch: iter_time=1.084e-04, loss_ctc=69.485, loss_att=52.474, acc=0.696, loss=57.561, grad_norm=4.421, loss_scale=1.000, learning_rate=8.638e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 09:20:08,277 (deepspeed_trainer:228) INFO: 22epoch:train:6701-6800batch: iter_time=1.091e-04, loss_ctc=68.856, loss_att=53.494, acc=0.695, loss=58.101, grad_norm=4.617, loss_scale=1.000, learning_rate=8.637e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 09:20:45,379 (deepspeed_trainer:228) INFO: 22epoch:train:6801-6900batch: iter_time=1.057e-04, loss_ctc=78.135, loss_att=56.549, acc=0.713, loss=63.065, grad_norm=4.770, loss_scale=1.000, learning_rate=8.635e-05, step_time=0.371 [2024-12-07 09:21:22,508] [INFO] [logging.py:129:log_dist] [Rank 0] step=322000, skipped=0, lr=[np.float64(8.633303540293883e-05)], mom=[[0.9, 0.98]] [2024-12-07 09:21:22,509] [INFO] [timer.py:264:stop] epoch=0/micro_step=67000/global_step=67000, RunningAvgSamplesPerSec=43.85044644523786, CurrSamplesPerSec=42.79879674339838, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 09:21:22,510 (deepspeed_trainer:228) INFO: 22epoch:train:6901-7000batch: iter_time=1.067e-04, loss_ctc=74.935, loss_att=52.247, acc=0.722, loss=59.039, grad_norm=4.407, loss_scale=1.000, learning_rate=8.634e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 09:21:59,767 (deepspeed_trainer:228) INFO: 22epoch:train:7001-7100batch: iter_time=1.073e-04, loss_ctc=65.081, loss_att=55.235, acc=0.702, loss=58.204, grad_norm=3.981, loss_scale=1.000, learning_rate=8.633e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 09:22:36,860 (deepspeed_trainer:228) INFO: 22epoch:train:7101-7200batch: iter_time=1.073e-04, loss_ctc=67.878, loss_att=47.813, acc=0.717, loss=53.849, grad_norm=4.562, loss_scale=1.000, learning_rate=8.631e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 09:23:14,336 (deepspeed_trainer:228) INFO: 22epoch:train:7201-7300batch: iter_time=1.080e-04, loss_ctc=79.991, loss_att=61.717, acc=0.700, loss=67.234, grad_norm=4.686, loss_scale=1.000, learning_rate=8.630e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 09:23:51,885 (deepspeed_trainer:228) INFO: 22epoch:train:7301-7400batch: iter_time=1.074e-04, loss_ctc=84.704, loss_att=65.859, acc=0.697, loss=71.534, grad_norm=4.704, loss_scale=1.000, learning_rate=8.629e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 09:24:28,970 (deepspeed_trainer:228) INFO: 22epoch:train:7401-7500batch: iter_time=1.058e-04, loss_ctc=74.516, loss_att=56.667, acc=0.712, loss=62.031, grad_norm=4.868, loss_scale=1.000, learning_rate=8.627e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 09:24:33,414 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-07 09:25:00,969 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 09:25:16,589 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 09:25:16,589 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 09:25:16,592 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 09:25:40,446] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:25:41,195] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:25:41,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:25:39,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:25:39,840] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:25:40,106] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:25:40,500] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:25:40,542] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:25:40,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:25:44,951] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:25:41,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:25:42,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:25:45,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:25:45,851] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:25:46,308] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:25:46,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:29,391] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:29,624] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:29,801] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:27,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:28,449] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:29,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:29,540] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:29,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:30,400] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:30,427] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:34,376] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:34,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:36,158] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:36,460] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:37,181] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:26:36,947] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:16,829] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:17,045] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:17,324] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:15,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:16,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:17,043] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:17,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:18,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:18,869] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:19,129] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:22,995] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:24,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:24,861] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:25,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:27,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:27:29,767] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:04,361] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:05,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:05,256] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:03,648] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:04,103] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:05,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:06,190] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:06,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:06,841] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:08,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:12,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:13,027] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:14,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:14,877] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:16,598] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:28:19,511] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 09:29:24,720 (deepspeed_trainer:228) INFO: 22epoch:train:7501-7600batch: iter_time=2.509, loss_ctc=68.379, loss_att=51.572, acc=0.710, loss=56.641, grad_norm=4.479, loss_scale=1.000, learning_rate=8.626e-05, step_time=0.448 [cnode7-012:0/16] 2024-12-07 09:30:02,398 (deepspeed_trainer:228) INFO: 22epoch:train:7601-7700batch: iter_time=1.108e-04, loss_ctc=74.362, loss_att=56.032, acc=0.709, loss=61.543, grad_norm=4.113, loss_scale=1.000, learning_rate=8.625e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 09:30:39,959 (deepspeed_trainer:228) INFO: 22epoch:train:7701-7800batch: iter_time=1.071e-04, loss_ctc=80.133, loss_att=58.959, acc=0.700, loss=65.289, grad_norm=4.683, loss_scale=1.000, learning_rate=8.623e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 09:31:17,215 (deepspeed_trainer:228) INFO: 22epoch:train:7801-7900batch: iter_time=1.087e-04, loss_ctc=79.078, loss_att=61.172, acc=0.688, loss=66.521, grad_norm=5.146, loss_scale=1.000, learning_rate=8.622e-05, step_time=0.372 [2024-12-07 09:31:54,297] [INFO] [logging.py:129:log_dist] [Rank 0] step=323000, skipped=0, lr=[np.float64(8.619928974879335e-05)], mom=[[0.9, 0.98]] [2024-12-07 09:31:54,298] [INFO] [timer.py:264:stop] epoch=0/micro_step=68000/global_step=68000, RunningAvgSamplesPerSec=43.8385063311696, CurrSamplesPerSec=46.27724550056001, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 09:31:54,299 (deepspeed_trainer:228) INFO: 22epoch:train:7901-8000batch: iter_time=1.058e-04, loss_ctc=70.713, loss_att=55.833, acc=0.703, loss=60.287, grad_norm=4.085, loss_scale=1.000, learning_rate=8.621e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 09:32:31,247 (deepspeed_trainer:228) INFO: 22epoch:train:8001-8100batch: iter_time=1.075e-04, loss_ctc=74.897, loss_att=56.481, acc=0.694, loss=62.018, grad_norm=4.457, loss_scale=1.000, learning_rate=8.619e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 09:33:08,198 (deepspeed_trainer:228) INFO: 22epoch:train:8101-8200batch: iter_time=1.101e-04, loss_ctc=70.859, loss_att=52.396, acc=0.715, loss=57.927, grad_norm=4.208, loss_scale=1.000, learning_rate=8.618e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 09:33:44,825 (deepspeed_trainer:228) INFO: 22epoch:train:8201-8300batch: iter_time=1.076e-04, loss_ctc=64.379, loss_att=51.987, acc=0.693, loss=55.724, grad_norm=4.511, loss_scale=1.000, learning_rate=8.617e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 09:34:22,017 (deepspeed_trainer:228) INFO: 22epoch:train:8301-8400batch: iter_time=1.079e-04, loss_ctc=79.926, loss_att=62.542, acc=0.687, loss=67.749, grad_norm=5.210, loss_scale=1.000, learning_rate=8.615e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 09:34:59,045 (deepspeed_trainer:228) INFO: 22epoch:train:8401-8500batch: iter_time=1.105e-04, loss_ctc=81.644, loss_att=54.402, acc=0.722, loss=62.584, grad_norm=4.728, loss_scale=1.000, learning_rate=8.614e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 09:35:35,958 (deepspeed_trainer:228) INFO: 22epoch:train:8501-8600batch: iter_time=1.074e-04, loss_ctc=65.539, loss_att=50.835, acc=0.690, loss=55.275, grad_norm=4.321, loss_scale=1.000, learning_rate=8.613e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 09:36:12,758 (deepspeed_trainer:228) INFO: 22epoch:train:8601-8700batch: iter_time=1.082e-04, loss_ctc=69.019, loss_att=52.611, acc=0.699, loss=57.547, grad_norm=4.760, loss_scale=1.000, learning_rate=8.611e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 09:36:49,980 (deepspeed_trainer:228) INFO: 22epoch:train:8701-8800batch: iter_time=1.098e-04, loss_ctc=77.338, loss_att=55.983, acc=0.716, loss=62.371, grad_norm=4.838, loss_scale=1.000, learning_rate=8.610e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 09:37:26,982 (deepspeed_trainer:228) INFO: 22epoch:train:8801-8900batch: iter_time=1.075e-04, loss_ctc=73.912, loss_att=52.200, acc=0.721, loss=58.705, grad_norm=4.458, loss_scale=1.000, learning_rate=8.609e-05, step_time=0.370 [2024-12-07 09:38:03,730] [INFO] [logging.py:129:log_dist] [Rank 0] step=324000, skipped=0, lr=[np.float64(8.606616376433557e-05)], mom=[[0.9, 0.98]] [2024-12-07 09:38:03,730] [INFO] [timer.py:264:stop] epoch=0/micro_step=69000/global_step=69000, RunningAvgSamplesPerSec=43.84682984654576, CurrSamplesPerSec=43.68457487818257, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 09:38:03,732 (deepspeed_trainer:228) INFO: 22epoch:train:8901-9000batch: iter_time=1.083e-04, loss_ctc=65.441, loss_att=53.932, acc=0.701, loss=57.383, grad_norm=4.252, loss_scale=1.000, learning_rate=8.607e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 09:38:40,448 (deepspeed_trainer:228) INFO: 22epoch:train:9001-9100batch: iter_time=1.091e-04, loss_ctc=71.963, loss_att=51.913, acc=0.715, loss=57.936, grad_norm=4.436, loss_scale=1.000, learning_rate=8.606e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 09:39:17,440 (deepspeed_trainer:228) INFO: 22epoch:train:9101-9200batch: iter_time=1.081e-04, loss_ctc=79.179, loss_att=62.568, acc=0.703, loss=67.571, grad_norm=4.908, loss_scale=1.000, learning_rate=8.605e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 09:39:54,416 (deepspeed_trainer:228) INFO: 22epoch:train:9201-9300batch: iter_time=1.076e-04, loss_ctc=80.271, loss_att=59.824, acc=0.702, loss=65.926, grad_norm=4.688, loss_scale=1.000, learning_rate=8.603e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 09:40:26,496 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-07 09:40:53,464 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 09:41:10,041 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 09:41:10,041 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 09:41:10,043 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 09:41:33,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:41:34,415] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:41:34,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:41:31,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:41:32,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:41:32,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:41:33,299] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:41:33,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:41:34,133] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:41:34,234] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:41:34,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:41:38,860] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:41:39,404] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:41:39,541] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:41:39,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:41:40,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:22,791] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:24,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:21,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:24,992] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:22,066] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:22,112] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:22,432] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:22,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:23,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:23,543] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:24,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:29,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:30,458] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:30,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:30,989] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:42:31,457] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:09,065] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:12,831] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:09,738] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:10,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:10,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:14,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:11,524] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:12,106] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:17,475] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:15,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:15,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:19,769] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:20,411] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:20,588] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:20,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:20,701] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:44:00,568] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:57,231] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:57,572] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:59,466] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:43:59,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:44:00,399] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:44:00,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:44:06,363] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:44:07,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:44:05,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:44:06,044] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:44:09,904] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:44:10,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:44:10,391] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:44:10,958] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:44:10,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 09:44:48,134 (deepspeed_trainer:228) INFO: 22epoch:train:9301-9400batch: iter_time=2.560, loss_ctc=75.292, loss_att=60.347, acc=0.704, loss=64.814, grad_norm=4.811, loss_scale=1.000, learning_rate=8.602e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 09:45:25,686 (deepspeed_trainer:228) INFO: 22epoch:train:9401-9500batch: iter_time=1.029e-04, loss_ctc=68.474, loss_att=48.726, acc=0.721, loss=54.686, grad_norm=4.807, loss_scale=1.000, learning_rate=8.601e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 09:46:03,288 (deepspeed_trainer:228) INFO: 22epoch:train:9501-9600batch: iter_time=1.036e-04, loss_ctc=72.892, loss_att=60.190, acc=0.713, loss=64.005, grad_norm=4.078, loss_scale=1.000, learning_rate=8.599e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 09:46:40,802 (deepspeed_trainer:228) INFO: 22epoch:train:9601-9700batch: iter_time=1.048e-04, loss_ctc=84.774, loss_att=63.410, acc=0.701, loss=69.855, grad_norm=4.788, loss_scale=1.000, learning_rate=8.598e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 09:47:18,009 (deepspeed_trainer:228) INFO: 22epoch:train:9701-9800batch: iter_time=1.078e-04, loss_ctc=71.649, loss_att=55.424, acc=0.701, loss=60.278, grad_norm=4.740, loss_scale=1.000, learning_rate=8.597e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 09:47:55,258 (deepspeed_trainer:228) INFO: 22epoch:train:9801-9900batch: iter_time=1.071e-04, loss_ctc=74.048, loss_att=58.114, acc=0.705, loss=62.903, grad_norm=4.361, loss_scale=1.000, learning_rate=8.595e-05, step_time=0.372 [2024-12-07 09:48:32,879] [INFO] [logging.py:129:log_dist] [Rank 0] step=325000, skipped=0, lr=[np.float64(8.593365267921413e-05)], mom=[[0.9, 0.98]] [2024-12-07 09:48:32,880] [INFO] [timer.py:264:stop] epoch=0/micro_step=70000/global_step=70000, RunningAvgSamplesPerSec=43.847893611073125, CurrSamplesPerSec=41.79843845098127, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 09:48:32,882 (deepspeed_trainer:228) INFO: 22epoch:train:9901-10000batch: iter_time=1.083e-04, loss_ctc=77.634, loss_att=61.067, acc=0.703, loss=66.013, grad_norm=4.770, loss_scale=1.000, learning_rate=8.594e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 09:49:10,559 (deepspeed_trainer:228) INFO: 22epoch:train:10001-10100batch: iter_time=1.079e-04, loss_ctc=66.975, loss_att=49.399, acc=0.722, loss=54.688, grad_norm=4.498, loss_scale=1.000, learning_rate=8.593e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 09:49:48,039 (deepspeed_trainer:228) INFO: 22epoch:train:10101-10200batch: iter_time=1.072e-04, loss_ctc=69.055, loss_att=54.564, acc=0.689, loss=58.897, grad_norm=5.029, loss_scale=1.000, learning_rate=8.591e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 09:50:25,922 (deepspeed_trainer:228) INFO: 22epoch:train:10201-10300batch: iter_time=1.129e-04, loss_ctc=75.776, loss_att=59.228, acc=0.706, loss=64.205, grad_norm=4.669, loss_scale=1.000, learning_rate=8.590e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 09:51:03,620 (deepspeed_trainer:228) INFO: 22epoch:train:10301-10400batch: iter_time=1.062e-04, loss_ctc=80.728, loss_att=59.050, acc=0.708, loss=65.561, grad_norm=4.490, loss_scale=1.000, learning_rate=8.589e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 09:51:41,375 (deepspeed_trainer:228) INFO: 22epoch:train:10401-10500batch: iter_time=1.075e-04, loss_ctc=65.256, loss_att=53.481, acc=0.700, loss=56.993, grad_norm=4.322, loss_scale=1.000, learning_rate=8.587e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 09:52:18,757 (deepspeed_trainer:228) INFO: 22epoch:train:10501-10600batch: iter_time=1.063e-04, loss_ctc=72.219, loss_att=52.804, acc=0.712, loss=58.635, grad_norm=4.907, loss_scale=1.000, learning_rate=8.586e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 09:52:56,078 (deepspeed_trainer:228) INFO: 22epoch:train:10601-10700batch: iter_time=1.079e-04, loss_ctc=77.083, loss_att=54.764, acc=0.717, loss=61.437, grad_norm=5.030, loss_scale=1.000, learning_rate=8.585e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 09:53:33,284 (deepspeed_trainer:228) INFO: 22epoch:train:10701-10800batch: iter_time=1.081e-04, loss_ctc=68.913, loss_att=52.247, acc=0.722, loss=57.224, grad_norm=4.316, loss_scale=1.000, learning_rate=8.583e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 09:54:10,256 (deepspeed_trainer:228) INFO: 22epoch:train:10801-10900batch: iter_time=1.103e-04, loss_ctc=64.938, loss_att=51.641, acc=0.710, loss=55.646, grad_norm=4.230, loss_scale=1.000, learning_rate=8.582e-05, step_time=0.369 [2024-12-07 09:54:47,455] [INFO] [logging.py:129:log_dist] [Rank 0] step=326000, skipped=0, lr=[np.float64(8.580175177433238e-05)], mom=[[0.9, 0.98]] [2024-12-07 09:54:47,456] [INFO] [timer.py:264:stop] epoch=0/micro_step=71000/global_step=71000, RunningAvgSamplesPerSec=43.84669284048369, CurrSamplesPerSec=44.727607694830624, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 09:54:47,457 (deepspeed_trainer:228) INFO: 22epoch:train:10901-11000batch: iter_time=1.059e-04, loss_ctc=69.663, loss_att=51.789, acc=0.711, loss=57.180, grad_norm=4.599, loss_scale=1.000, learning_rate=8.581e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 09:55:25,261 (deepspeed_trainer:228) INFO: 22epoch:train:11001-11100batch: iter_time=1.095e-04, loss_ctc=90.420, loss_att=70.087, acc=0.717, loss=76.175, grad_norm=4.444, loss_scale=1.000, learning_rate=8.580e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 09:56:02,750 (deepspeed_trainer:228) INFO: 22epoch:train:11101-11200batch: iter_time=1.073e-04, loss_ctc=73.789, loss_att=56.557, acc=0.708, loss=61.726, grad_norm=5.049, loss_scale=1.000, learning_rate=8.578e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 09:56:26,236 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-07 09:56:53,675 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 09:57:10,557 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 09:57:10,557 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 09:57:10,559 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 09:57:34,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:57:30,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:57:34,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:57:35,058] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:57:31,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:57:32,042] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:57:32,147] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:57:33,416] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:57:37,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:57:33,474] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:57:33,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:57:34,366] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:57:38,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:57:38,349] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:57:38,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:57:40,163] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:18,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:19,098] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:19,874] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:23,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:23,983] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:24,085] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:20,667] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:21,349] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:22,031] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:22,471] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:22,759] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:27,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:27,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:28,235] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:28,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:58:29,980] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:06,626] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:11,763] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:08,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:12,321] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:12,449] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:08,958] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:09,419] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:09,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:10,413] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:10,891] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:11,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:15,837] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:16,900] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:18,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:19,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:20,682] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:53,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:56,181] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:56,466] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:00:00,434] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:00:00,561] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:00:01,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:57,396] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:57,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:58,675] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 09:59:59,541] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:00:00,379] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:00:05,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:00:05,760] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:00:07,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:00:08,790] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:00:11,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 10:00:57,421 (deepspeed_trainer:228) INFO: 22epoch:train:11201-11300batch: iter_time=2.519, loss_ctc=77.387, loss_att=64.526, acc=0.697, loss=68.397, grad_norm=5.314, loss_scale=1.000, learning_rate=8.577e-05, step_time=0.427 [cnode7-012:0/16] 2024-12-07 10:01:35,152 (deepspeed_trainer:228) INFO: 22epoch:train:11301-11400batch: iter_time=1.245e-04, loss_ctc=66.540, loss_att=46.581, acc=0.721, loss=52.590, grad_norm=4.135, loss_scale=1.000, learning_rate=8.576e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 10:02:12,105 (deepspeed_trainer:228) INFO: 22epoch:train:11401-11500batch: iter_time=1.294e-04, loss_ctc=74.376, loss_att=58.087, acc=0.708, loss=62.955, grad_norm=4.130, loss_scale=1.000, learning_rate=8.574e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 10:02:50,012 (deepspeed_trainer:228) INFO: 22epoch:train:11501-11600batch: iter_time=1.598e-04, loss_ctc=76.625, loss_att=58.708, acc=0.699, loss=64.066, grad_norm=4.594, loss_scale=1.000, learning_rate=8.573e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 10:03:28,104 (deepspeed_trainer:228) INFO: 22epoch:train:11601-11700batch: iter_time=1.493e-04, loss_ctc=75.230, loss_att=56.492, acc=0.695, loss=62.118, grad_norm=4.459, loss_scale=1.000, learning_rate=8.572e-05, step_time=0.381 [cnode7-012:0/16] 2024-12-07 10:04:05,410 (deepspeed_trainer:228) INFO: 22epoch:train:11701-11800batch: iter_time=1.192e-04, loss_ctc=71.129, loss_att=52.790, acc=0.707, loss=58.313, grad_norm=4.456, loss_scale=1.000, learning_rate=8.570e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 10:04:42,822 (deepspeed_trainer:228) INFO: 22epoch:train:11801-11900batch: iter_time=1.148e-04, loss_ctc=79.505, loss_att=62.929, acc=0.694, loss=67.912, grad_norm=4.488, loss_scale=1.000, learning_rate=8.569e-05, step_time=0.374 [2024-12-07 10:05:19,831] [INFO] [logging.py:129:log_dist] [Rank 0] step=327000, skipped=0, lr=[np.float64(8.567045638114239e-05)], mom=[[0.9, 0.98]] [2024-12-07 10:05:19,832] [INFO] [timer.py:264:stop] epoch=0/micro_step=72000/global_step=72000, RunningAvgSamplesPerSec=43.83671576298019, CurrSamplesPerSec=43.337584018829716, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 10:05:19,834 (deepspeed_trainer:228) INFO: 22epoch:train:11901-12000batch: iter_time=1.147e-04, loss_ctc=64.100, loss_att=48.354, acc=0.709, loss=53.076, grad_norm=4.317, loss_scale=1.000, learning_rate=8.568e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 10:05:57,012 (deepspeed_trainer:228) INFO: 22epoch:train:12001-12100batch: iter_time=1.131e-04, loss_ctc=70.627, loss_att=55.282, acc=0.691, loss=59.908, grad_norm=5.176, loss_scale=1.000, learning_rate=8.566e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 10:06:34,137 (deepspeed_trainer:228) INFO: 22epoch:train:12101-12200batch: iter_time=1.141e-04, loss_ctc=75.998, loss_att=57.063, acc=0.707, loss=62.739, grad_norm=5.038, loss_scale=1.000, learning_rate=8.565e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 10:07:11,694 (deepspeed_trainer:228) INFO: 22epoch:train:12201-12300batch: iter_time=1.105e-04, loss_ctc=78.918, loss_att=56.755, acc=0.696, loss=63.373, grad_norm=5.211, loss_scale=1.000, learning_rate=8.564e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 10:07:48,727 (deepspeed_trainer:228) INFO: 22epoch:train:12301-12400batch: iter_time=1.086e-04, loss_ctc=70.351, loss_att=55.557, acc=0.703, loss=60.009, grad_norm=4.600, loss_scale=1.000, learning_rate=8.562e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 10:08:26,046 (deepspeed_trainer:228) INFO: 22epoch:train:12401-12500batch: iter_time=1.113e-04, loss_ctc=75.122, loss_att=56.868, acc=0.704, loss=62.335, grad_norm=4.862, loss_scale=1.000, learning_rate=8.561e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 10:09:03,665 (deepspeed_trainer:228) INFO: 22epoch:train:12501-12600batch: iter_time=1.113e-04, loss_ctc=74.812, loss_att=51.010, acc=0.718, loss=58.165, grad_norm=4.875, loss_scale=1.000, learning_rate=8.560e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 10:09:40,551 (deepspeed_trainer:228) INFO: 22epoch:train:12601-12700batch: iter_time=1.137e-04, loss_ctc=65.043, loss_att=52.941, acc=0.711, loss=56.547, grad_norm=3.886, loss_scale=1.000, learning_rate=8.559e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 10:10:17,406 (deepspeed_trainer:228) INFO: 22epoch:train:12701-12800batch: iter_time=1.157e-04, loss_ctc=66.404, loss_att=49.542, acc=0.715, loss=54.599, grad_norm=4.569, loss_scale=1.000, learning_rate=8.557e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 10:10:54,528 (deepspeed_trainer:228) INFO: 22epoch:train:12801-12900batch: iter_time=1.193e-04, loss_ctc=77.130, loss_att=56.661, acc=0.711, loss=62.794, grad_norm=4.738, loss_scale=1.000, learning_rate=8.556e-05, step_time=0.371 [2024-12-07 10:11:32,519] [INFO] [logging.py:129:log_dist] [Rank 0] step=328000, skipped=0, lr=[np.float64(8.553976188095108e-05)], mom=[[0.9, 0.98]] [2024-12-07 10:11:32,520] [INFO] [timer.py:264:stop] epoch=0/micro_step=73000/global_step=73000, RunningAvgSamplesPerSec=43.83916431809112, CurrSamplesPerSec=45.007330501953746, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 10:11:32,521 (deepspeed_trainer:228) INFO: 22epoch:train:12901-13000batch: iter_time=1.111e-04, loss_ctc=88.083, loss_att=68.564, acc=0.698, loss=74.455, grad_norm=5.039, loss_scale=1.000, learning_rate=8.555e-05, step_time=0.380 [cnode7-012:0/16] 2024-12-07 10:12:10,163 (deepspeed_trainer:228) INFO: 22epoch:train:13001-13100batch: iter_time=1.129e-04, loss_ctc=71.272, loss_att=59.785, acc=0.700, loss=63.242, grad_norm=5.398, loss_scale=1.000, learning_rate=8.553e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 10:12:24,000 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-07 10:12:51,179 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 10:13:07,634 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 10:13:07,635 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 10:13:07,637 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 10:13:31,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:13:31,860] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:13:29,174] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:13:33,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:13:29,727] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:13:30,351] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:13:30,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:13:30,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:13:31,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:13:31,654] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:13:31,861] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:13:35,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:13:36,291] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:13:36,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:13:37,601] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:13:37,857] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:21,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:17,956] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:21,691] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:21,950] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:18,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:18,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:20,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:20,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:20,828] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:21,032] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:21,069] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:26,623] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:27,358] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:27,922] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:28,422] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:14:28,947] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:05,999] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:09,796] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:10,365] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:06,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:08,128] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:08,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:09,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:09,837] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:09,887] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:13,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:10,234] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:15,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:16,518] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:16,866] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:17,539] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:17,791] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:57,380] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:57,713] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:54,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:54,521] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:56,980] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:57,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:57,227] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:57,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:57,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:15:58,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:16:05,008] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:16:05,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:16:06,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:16:07,158] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:16:08,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:16:08,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 10:17:04,362 (deepspeed_trainer:228) INFO: 22epoch:train:13101-13200batch: iter_time=2.543, loss_ctc=73.158, loss_att=54.503, acc=0.712, loss=60.087, grad_norm=5.112, loss_scale=1.000, learning_rate=8.552e-05, step_time=0.398 [cnode7-012:0/16] 2024-12-07 10:17:41,812 (deepspeed_trainer:228) INFO: 22epoch:train:13201-13300batch: iter_time=1.079e-04, loss_ctc=69.743, loss_att=52.494, acc=0.721, loss=57.668, grad_norm=4.421, loss_scale=1.000, learning_rate=8.551e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 10:18:19,503 (deepspeed_trainer:228) INFO: 22epoch:train:13301-13400batch: iter_time=1.050e-04, loss_ctc=77.292, loss_att=61.022, acc=0.707, loss=65.890, grad_norm=4.507, loss_scale=1.000, learning_rate=8.549e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 10:18:57,160 (deepspeed_trainer:228) INFO: 22epoch:train:13401-13500batch: iter_time=1.056e-04, loss_ctc=75.409, loss_att=58.210, acc=0.708, loss=63.359, grad_norm=4.536, loss_scale=1.000, learning_rate=8.548e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 10:19:34,624 (deepspeed_trainer:228) INFO: 22epoch:train:13501-13600batch: iter_time=1.077e-04, loss_ctc=74.297, loss_att=57.295, acc=0.702, loss=62.423, grad_norm=4.743, loss_scale=1.000, learning_rate=8.547e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 10:20:12,114 (deepspeed_trainer:228) INFO: 22epoch:train:13601-13700batch: iter_time=1.098e-04, loss_ctc=72.866, loss_att=54.212, acc=0.704, loss=59.832, grad_norm=4.731, loss_scale=1.000, learning_rate=8.546e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 10:20:49,467 (deepspeed_trainer:228) INFO: 22epoch:train:13701-13800batch: iter_time=1.153e-04, loss_ctc=71.102, loss_att=56.260, acc=0.715, loss=60.734, grad_norm=4.297, loss_scale=1.000, learning_rate=8.544e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 10:21:26,721 (deepspeed_trainer:228) INFO: 22epoch:train:13801-13900batch: iter_time=1.059e-04, loss_ctc=69.564, loss_att=56.190, acc=0.699, loss=60.188, grad_norm=4.973, loss_scale=1.000, learning_rate=8.543e-05, step_time=0.372 [2024-12-07 10:22:03,931] [INFO] [logging.py:129:log_dist] [Rank 0] step=329000, skipped=0, lr=[np.float64(8.54096637042376e-05)], mom=[[0.9, 0.98]] [2024-12-07 10:22:03,932] [INFO] [timer.py:264:stop] epoch=0/micro_step=74000/global_step=74000, RunningAvgSamplesPerSec=43.834479409220165, CurrSamplesPerSec=38.690963123436774, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 10:22:03,934 (deepspeed_trainer:228) INFO: 22epoch:train:13901-14000batch: iter_time=1.057e-04, loss_ctc=71.229, loss_att=53.402, acc=0.705, loss=58.774, grad_norm=5.049, loss_scale=1.000, learning_rate=8.542e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 10:22:41,396 (deepspeed_trainer:228) INFO: 22epoch:train:14001-14100batch: iter_time=1.059e-04, loss_ctc=83.780, loss_att=63.395, acc=0.703, loss=69.509, grad_norm=5.318, loss_scale=1.000, learning_rate=8.540e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 10:23:18,582 (deepspeed_trainer:228) INFO: 22epoch:train:14101-14200batch: iter_time=1.073e-04, loss_ctc=68.664, loss_att=51.524, acc=0.708, loss=56.684, grad_norm=4.487, loss_scale=1.000, learning_rate=8.539e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 10:23:56,342 (deepspeed_trainer:228) INFO: 22epoch:train:14201-14300batch: iter_time=1.059e-04, loss_ctc=67.961, loss_att=52.911, acc=0.702, loss=57.443, grad_norm=5.347, loss_scale=1.000, learning_rate=8.538e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 10:24:33,692 (deepspeed_trainer:228) INFO: 22epoch:train:14301-14400batch: iter_time=1.092e-04, loss_ctc=77.639, loss_att=55.553, acc=0.722, loss=62.189, grad_norm=5.512, loss_scale=1.000, learning_rate=8.536e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 10:25:11,050 (deepspeed_trainer:228) INFO: 22epoch:train:14401-14500batch: iter_time=1.092e-04, loss_ctc=74.062, loss_att=51.588, acc=0.733, loss=58.319, grad_norm=4.746, loss_scale=1.000, learning_rate=8.535e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 10:25:48,204 (deepspeed_trainer:228) INFO: 22epoch:train:14501-14600batch: iter_time=1.068e-04, loss_ctc=64.221, loss_att=55.104, acc=0.704, loss=57.818, grad_norm=4.331, loss_scale=1.000, learning_rate=8.534e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 10:26:25,239 (deepspeed_trainer:228) INFO: 22epoch:train:14601-14700batch: iter_time=1.064e-04, loss_ctc=67.606, loss_att=47.316, acc=0.722, loss=53.422, grad_norm=5.075, loss_scale=1.000, learning_rate=8.533e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 10:27:02,805 (deepspeed_trainer:228) INFO: 22epoch:train:14701-14800batch: iter_time=1.059e-04, loss_ctc=79.452, loss_att=62.493, acc=0.709, loss=67.583, grad_norm=4.461, loss_scale=1.000, learning_rate=8.531e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 10:27:40,397 (deepspeed_trainer:228) INFO: 22epoch:train:14801-14900batch: iter_time=1.043e-04, loss_ctc=83.373, loss_att=65.385, acc=0.706, loss=70.763, grad_norm=5.078, loss_scale=1.000, learning_rate=8.530e-05, step_time=0.375 [2024-12-07 10:28:17,408] [INFO] [logging.py:129:log_dist] [Rank 0] step=330000, skipped=0, lr=[np.float64(8.528015732998248e-05)], mom=[[0.9, 0.98]] [2024-12-07 10:28:17,409] [INFO] [timer.py:264:stop] epoch=0/micro_step=75000/global_step=75000, RunningAvgSamplesPerSec=43.835764673477264, CurrSamplesPerSec=44.346855940788245, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 10:28:17,410 (deepspeed_trainer:228) INFO: 22epoch:train:14901-15000batch: iter_time=1.049e-04, loss_ctc=73.095, loss_att=57.505, acc=0.714, loss=62.192, grad_norm=5.060, loss_scale=1.000, learning_rate=8.529e-05, step_time=0.370 [2024-12-07 10:28:30,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:34,363] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:30,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:30,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:34,413] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:30,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:34,721] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:31,016] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:34,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:34,910] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:31,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:31,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:31,127] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:35,037] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:35,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:35,042] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:45,656] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:46,265] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:50,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:50,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:46,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:50,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:47,265] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:47,402] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:47,967] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:51,815] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:48,015] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:48,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:51,959] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:52,017] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:52,505] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:28:52,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:01,251] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:02,017] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:05,971] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:02,315] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:06,179] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:06,581] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:02,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:03,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:07,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:07,736] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:03,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:03,966] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:04,147] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:08,012] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:08,386] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:08,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:16,640] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:17,881] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:21,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:21,898] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:22,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:18,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:18,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:19,033] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:23,305] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:19,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:19,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:19,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:23,840] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:23,896] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:24,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:24,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:29:37,044] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 22 is about to be saved! [2024-12-07 10:29:37,077] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/mp_rank_00_model_states.pt [2024-12-07 10:29:37,077] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/mp_rank_00_model_states.pt... [2024-12-07 10:29:39,132] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/mp_rank_00_model_states.pt. [2024-12-07 10:29:35,465] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-07 10:29:39,287] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-07 10:29:39,291] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-07 10:29:35,470] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-07 10:29:35,470] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-07 10:29:35,470] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-07 10:29:35,470] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-07 10:29:35,471] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-07 10:29:35,471] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-07 10:29:39,292] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-07 10:29:39,293] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-07 10:29:35,472] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-07 10:29:39,294] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-07 10:29:39,294] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-07 10:29:39,295] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-07 10:29:39,295] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-07 10:29:39,879] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-07 10:29:39,879] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-07 10:29:39,880] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [2024-12-07 10:29:40,046] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-07 10:29:40,046] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-07 10:29:40,046] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [2024-12-07 10:29:40,054] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-07 10:29:40,054] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-07 10:29:40,054] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [2024-12-07 10:29:40,095] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-07 10:29:40,112] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-07 10:29:40,112] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [2024-12-07 10:29:40,117] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-07 10:29:40,117] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-07 10:29:40,117] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [2024-12-07 10:29:40,131] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-07 10:29:40,131] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-07 10:29:40,131] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [2024-12-07 10:29:40,136] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-07 10:29:40,137] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-07 10:29:40,137] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [2024-12-07 10:29:40,137] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-07 10:29:40,137] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-07 10:29:40,137] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [2024-12-07 10:29:36,335] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-07 10:29:36,335] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-07 10:29:36,335] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [2024-12-07 10:29:36,335] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-07 10:29:36,335] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-07 10:29:36,335] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-07 10:29:36,335] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-07 10:29:36,335] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-07 10:29:36,335] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [2024-12-07 10:29:36,335] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [2024-12-07 10:29:36,335] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-07 10:29:36,336] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [2024-12-07 10:29:36,336] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-07 10:29:36,336] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-07 10:29:36,336] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [2024-12-07 10:29:36,336] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-07 10:29:36,336] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-07 10:29:36,336] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [2024-12-07 10:29:36,364] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-07 10:29:36,364] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-07 10:29:36,364] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [2024-12-07 10:29:36,368] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-07 10:29:36,368] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_22/22/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-07 10:29:36,368] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 22 is ready now! [cnode7-012:0/16] 2024-12-07 10:29:40,220 (deepspeed_trainer:158) INFO: 22epoch results: [train] iter_time=0.135, loss_ctc=74.172, loss_att=56.485, acc=0.705, loss=61.792, grad_norm=4.696, loss_scale=1.000, learning_rate=8.627e-05, step_time=0.374, time=2 hours, 7 minutes and 35.93 seconds, total_count=330022, gpu_max_cached_mem_GB=27.986, [valid] loss_ctc=4.031, cer_ctc=0.123, loss_att=8.000, acc=0.745, cer=0.489, wer=1.000, loss=6.812, time=1 minute and 9.59 seconds, total_count=22, gpu_max_cached_mem_GB=27.986 [cnode7-012:0/16] 2024-12-07 10:29:41,416 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-07 10:30:08,323 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 10:30:24,273 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 10:30:24,274 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 10:30:24,276 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 10:30:41,336] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:30:41,793] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:30:45,900] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:30:42,265] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:30:42,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:30:42,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:30:42,857] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:30:43,186] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:30:43,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:30:48,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:30:48,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:30:48,942] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:30:49,166] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:30:49,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:30:49,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:30:50,146] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:28,876] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:32,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:29,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:29,661] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:30,179] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:30,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:31,142] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:32,163] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:33,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:38,290] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:38,472] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:38,586] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:39,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:39,246] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:39,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:31:39,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:15,980] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:16,487] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:20,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:17,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:17,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:18,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:18,844] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:19,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:27,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:23,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:27,987] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:28,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:28,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:28,922] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:29,131] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:32:31,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:03,043] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:03,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:07,925] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:05,167] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:05,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:06,301] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:06,535] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:08,340] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:16,039] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:12,966] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:17,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:18,828] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:18,900] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:18,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:19,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:33:22,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 10:34:26,247 (deepspeed_trainer:228) INFO: 23epoch:train:1-100batch: iter_time=2.451, loss_ctc=68.600, loss_att=54.940, acc=0.714, loss=59.035, grad_norm=4.611, loss_scale=1.000, learning_rate=8.527e-05, step_time=0.397 [cnode7-012:0/16] 2024-12-07 10:35:03,921 (deepspeed_trainer:228) INFO: 23epoch:train:101-200batch: iter_time=1.039e-04, loss_ctc=72.579, loss_att=58.818, acc=0.699, loss=62.963, grad_norm=4.579, loss_scale=1.000, learning_rate=8.526e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 10:35:41,372 (deepspeed_trainer:228) INFO: 23epoch:train:201-300batch: iter_time=1.072e-04, loss_ctc=68.294, loss_att=55.815, acc=0.704, loss=59.581, grad_norm=4.646, loss_scale=1.000, learning_rate=8.525e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 10:36:18,571 (deepspeed_trainer:228) INFO: 23epoch:train:301-400batch: iter_time=1.052e-04, loss_ctc=78.968, loss_att=58.071, acc=0.696, loss=64.356, grad_norm=5.764, loss_scale=1.000, learning_rate=8.523e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 10:36:56,088 (deepspeed_trainer:228) INFO: 23epoch:train:401-500batch: iter_time=1.079e-04, loss_ctc=77.557, loss_att=57.873, acc=0.704, loss=63.807, grad_norm=4.722, loss_scale=1.000, learning_rate=8.522e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 10:37:33,291 (deepspeed_trainer:228) INFO: 23epoch:train:501-600batch: iter_time=1.091e-04, loss_ctc=72.816, loss_att=59.157, acc=0.703, loss=63.234, grad_norm=4.863, loss_scale=1.000, learning_rate=8.521e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 10:38:10,844 (deepspeed_trainer:228) INFO: 23epoch:train:601-700batch: iter_time=1.101e-04, loss_ctc=72.915, loss_att=54.746, acc=0.709, loss=60.174, grad_norm=4.649, loss_scale=1.000, learning_rate=8.520e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 10:38:48,205 (deepspeed_trainer:228) INFO: 23epoch:train:701-800batch: iter_time=1.096e-04, loss_ctc=83.493, loss_att=55.713, acc=0.702, loss=64.062, grad_norm=5.410, loss_scale=1.000, learning_rate=8.518e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 10:39:26,031 (deepspeed_trainer:228) INFO: 23epoch:train:801-900batch: iter_time=1.091e-04, loss_ctc=90.987, loss_att=69.307, acc=0.703, loss=75.817, grad_norm=6.170, loss_scale=1.000, learning_rate=8.517e-05, step_time=0.378 [2024-12-07 10:40:03,167] [INFO] [logging.py:129:log_dist] [Rank 0] step=331000, skipped=0, lr=[np.float64(8.515123828500762e-05)], mom=[[0.9, 0.98]] [2024-12-07 10:40:03,168] [INFO] [timer.py:264:stop] epoch=0/micro_step=76000/global_step=76000, RunningAvgSamplesPerSec=43.83182232471169, CurrSamplesPerSec=45.09221871369181, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 10:40:03,170 (deepspeed_trainer:228) INFO: 23epoch:train:901-1000batch: iter_time=1.107e-04, loss_ctc=65.156, loss_att=48.359, acc=0.714, loss=53.405, grad_norm=4.488, loss_scale=1.000, learning_rate=8.516e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 10:40:40,301 (deepspeed_trainer:228) INFO: 23epoch:train:1001-1100batch: iter_time=1.104e-04, loss_ctc=70.979, loss_att=53.873, acc=0.704, loss=59.005, grad_norm=4.639, loss_scale=1.000, learning_rate=8.514e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 10:41:18,150 (deepspeed_trainer:228) INFO: 23epoch:train:1101-1200batch: iter_time=1.093e-04, loss_ctc=74.698, loss_att=61.828, acc=0.700, loss=65.706, grad_norm=4.548, loss_scale=1.000, learning_rate=8.513e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 10:41:55,526 (deepspeed_trainer:228) INFO: 23epoch:train:1201-1300batch: iter_time=1.077e-04, loss_ctc=81.766, loss_att=58.191, acc=0.714, loss=65.263, grad_norm=5.074, loss_scale=1.000, learning_rate=8.512e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 10:42:33,091 (deepspeed_trainer:228) INFO: 23epoch:train:1301-1400batch: iter_time=1.089e-04, loss_ctc=79.584, loss_att=60.412, acc=0.713, loss=66.168, grad_norm=4.846, loss_scale=1.000, learning_rate=8.511e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 10:43:10,322 (deepspeed_trainer:228) INFO: 23epoch:train:1401-1500batch: iter_time=1.077e-04, loss_ctc=72.923, loss_att=54.071, acc=0.724, loss=59.696, grad_norm=4.671, loss_scale=1.000, learning_rate=8.509e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 10:43:47,992 (deepspeed_trainer:228) INFO: 23epoch:train:1501-1600batch: iter_time=1.099e-04, loss_ctc=83.574, loss_att=62.932, acc=0.698, loss=69.105, grad_norm=5.152, loss_scale=1.000, learning_rate=8.508e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 10:44:25,132 (deepspeed_trainer:228) INFO: 23epoch:train:1601-1700batch: iter_time=1.076e-04, loss_ctc=69.525, loss_att=52.910, acc=0.716, loss=57.890, grad_norm=4.899, loss_scale=1.000, learning_rate=8.507e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 10:45:02,742 (deepspeed_trainer:228) INFO: 23epoch:train:1701-1800batch: iter_time=1.073e-04, loss_ctc=76.373, loss_att=67.047, acc=0.693, loss=69.830, grad_norm=5.002, loss_scale=1.000, learning_rate=8.505e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 10:45:35,097 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-07 10:46:02,004 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 10:46:18,574 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 10:46:18,574 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 10:46:18,576 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 10:46:43,079] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:46:43,174] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:46:43,500] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:46:46,844] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:46:47,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:46:47,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:46:48,275] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:46:49,143] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:46:47,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:46:47,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:46:47,487] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:46:47,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:46:47,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:46:47,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:46:47,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:46:47,720] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:32,800] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:32,886] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:34,909] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:36,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:37,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:38,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:38,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:39,032] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:41,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:42,444] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:42,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:42,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:42,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:42,985] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:43,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:47:43,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:21,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:21,290] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:23,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:25,486] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:27,008] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:27,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:27,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:28,583] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:28,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:31,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:31,927] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:32,695] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:32,708] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:32,827] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:32,886] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:48:32,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:08,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:09,338] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:11,873] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:14,936] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:15,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:16,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:16,223] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:16,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:15,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:19,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:20,065] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:20,331] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:21,024] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:21,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:21,562] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 10:49:24,106] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 10:50:04,498 (deepspeed_trainer:228) INFO: 23epoch:train:1801-1900batch: iter_time=2.533, loss_ctc=69.082, loss_att=53.247, acc=0.700, loss=58.004, grad_norm=4.504, loss_scale=1.000, learning_rate=8.504e-05, step_time=0.484 [2024-12-07 10:50:42,428] [INFO] [logging.py:129:log_dist] [Rank 0] step=332000, skipped=0, lr=[np.float64(8.502290214332742e-05)], mom=[[0.9, 0.98]] [2024-12-07 10:50:42,428] [INFO] [timer.py:264:stop] epoch=0/micro_step=77000/global_step=77000, RunningAvgSamplesPerSec=43.81352200952583, CurrSamplesPerSec=45.16286448532218, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 10:50:42,430 (deepspeed_trainer:228) INFO: 23epoch:train:1901-2000batch: iter_time=1.083e-04, loss_ctc=76.013, loss_att=64.048, acc=0.701, loss=67.593, grad_norm=4.577, loss_scale=1.000, learning_rate=8.503e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 10:51:19,803 (deepspeed_trainer:228) INFO: 23epoch:train:2001-2100batch: iter_time=1.078e-04, loss_ctc=63.440, loss_att=53.895, acc=0.696, loss=56.753, grad_norm=4.691, loss_scale=1.000, learning_rate=8.502e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 10:51:56,605 (deepspeed_trainer:228) INFO: 23epoch:train:2101-2200batch: iter_time=1.092e-04, loss_ctc=73.774, loss_att=60.333, acc=0.696, loss=64.350, grad_norm=5.258, loss_scale=1.000, learning_rate=8.500e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 10:52:33,383 (deepspeed_trainer:228) INFO: 23epoch:train:2201-2300batch: iter_time=1.098e-04, loss_ctc=77.438, loss_att=55.107, acc=0.701, loss=61.801, grad_norm=5.395, loss_scale=1.000, learning_rate=8.499e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 10:53:10,197 (deepspeed_trainer:228) INFO: 23epoch:train:2301-2400batch: iter_time=1.152e-04, loss_ctc=77.067, loss_att=59.407, acc=0.700, loss=64.730, grad_norm=4.770, loss_scale=1.000, learning_rate=8.498e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 10:53:47,153 (deepspeed_trainer:228) INFO: 23epoch:train:2401-2500batch: iter_time=1.103e-04, loss_ctc=72.299, loss_att=56.427, acc=0.702, loss=61.213, grad_norm=4.638, loss_scale=1.000, learning_rate=8.497e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 10:54:24,151 (deepspeed_trainer:228) INFO: 23epoch:train:2501-2600batch: iter_time=1.100e-04, loss_ctc=70.061, loss_att=53.152, acc=0.705, loss=58.208, grad_norm=4.400, loss_scale=1.000, learning_rate=8.495e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 10:55:01,644 (deepspeed_trainer:228) INFO: 23epoch:train:2601-2700batch: iter_time=1.096e-04, loss_ctc=84.957, loss_att=57.524, acc=0.695, loss=65.758, grad_norm=5.755, loss_scale=1.000, learning_rate=8.494e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 10:55:38,971 (deepspeed_trainer:228) INFO: 23epoch:train:2701-2800batch: iter_time=1.086e-04, loss_ctc=78.741, loss_att=58.838, acc=0.703, loss=64.795, grad_norm=6.079, loss_scale=1.000, learning_rate=8.493e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 10:56:16,244 (deepspeed_trainer:228) INFO: 23epoch:train:2801-2900batch: iter_time=1.109e-04, loss_ctc=71.761, loss_att=52.106, acc=0.711, loss=58.016, grad_norm=4.502, loss_scale=1.000, learning_rate=8.491e-05, step_time=0.372 [2024-12-07 10:56:53,466] [INFO] [logging.py:129:log_dist] [Rank 0] step=333000, skipped=0, lr=[np.float64(8.48951445255105e-05)], mom=[[0.9, 0.98]] [2024-12-07 10:56:53,467] [INFO] [timer.py:264:stop] epoch=0/micro_step=78000/global_step=78000, RunningAvgSamplesPerSec=43.81872404834681, CurrSamplesPerSec=38.800352256619966, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 10:56:53,468 (deepspeed_trainer:228) INFO: 23epoch:train:2901-3000batch: iter_time=1.101e-04, loss_ctc=70.358, loss_att=57.779, acc=0.699, loss=61.552, grad_norm=5.067, loss_scale=1.000, learning_rate=8.490e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 10:57:30,394 (deepspeed_trainer:228) INFO: 23epoch:train:3001-3100batch: iter_time=1.099e-04, loss_ctc=74.579, loss_att=59.483, acc=0.703, loss=63.974, grad_norm=4.935, loss_scale=1.000, learning_rate=8.489e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 10:58:07,212 (deepspeed_trainer:228) INFO: 23epoch:train:3101-3200batch: iter_time=1.089e-04, loss_ctc=81.721, loss_att=56.165, acc=0.707, loss=63.814, grad_norm=5.519, loss_scale=1.000, learning_rate=8.488e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 10:58:44,353 (deepspeed_trainer:228) INFO: 23epoch:train:3201-3300batch: iter_time=1.118e-04, loss_ctc=77.916, loss_att=60.434, acc=0.717, loss=65.681, grad_norm=5.572, loss_scale=1.000, learning_rate=8.486e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 10:59:21,207 (deepspeed_trainer:228) INFO: 23epoch:train:3301-3400batch: iter_time=1.095e-04, loss_ctc=72.109, loss_att=53.724, acc=0.714, loss=59.242, grad_norm=4.114, loss_scale=1.000, learning_rate=8.485e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 10:59:58,381 (deepspeed_trainer:228) INFO: 23epoch:train:3401-3500batch: iter_time=1.105e-04, loss_ctc=81.660, loss_att=60.486, acc=0.702, loss=66.818, grad_norm=4.903, loss_scale=1.000, learning_rate=8.484e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 11:00:35,048 (deepspeed_trainer:228) INFO: 23epoch:train:3501-3600batch: iter_time=1.100e-04, loss_ctc=63.953, loss_att=50.693, acc=0.710, loss=54.662, grad_norm=4.860, loss_scale=1.000, learning_rate=8.483e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 11:01:12,643 (deepspeed_trainer:228) INFO: 23epoch:train:3601-3700batch: iter_time=1.095e-04, loss_ctc=82.443, loss_att=68.198, acc=0.693, loss=72.476, grad_norm=4.953, loss_scale=1.000, learning_rate=8.481e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 11:01:35,731 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-07 11:02:02,082 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 11:02:17,976 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 11:02:17,976 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 11:02:17,978 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 11:02:43,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:02:43,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:02:43,406] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:02:40,411] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:02:40,658] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:02:40,902] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:02:41,029] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:02:41,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:02:41,493] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:02:41,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:02:41,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:02:47,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:02:48,471] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:02:49,059] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:02:49,212] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:02:49,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:31,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:31,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:30,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:31,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:31,218] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:31,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:31,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:31,541] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:31,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:31,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:35,824] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:36,424] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:36,791] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:38,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:38,518] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:03:39,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:19,110] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:17,943] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:18,482] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:19,592] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:23,664] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:20,562] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:20,643] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:20,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:20,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:25,908] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:26,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:26,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:28,155] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:24,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:28,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:04:29,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:06,638] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:05,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:08,283] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:09,356] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:09,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:09,562] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:09,678] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:10,016] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:14,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:14,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:15,074] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:11,282] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:16,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:17,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:17,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:05:17,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 11:06:04,294 (deepspeed_trainer:228) INFO: 23epoch:train:3701-3800batch: iter_time=2.522, loss_ctc=66.792, loss_att=52.921, acc=0.698, loss=57.089, grad_norm=4.527, loss_scale=1.000, learning_rate=8.480e-05, step_time=0.394 [cnode7-012:0/16] 2024-12-07 11:06:42,203 (deepspeed_trainer:228) INFO: 23epoch:train:3801-3900batch: iter_time=1.043e-04, loss_ctc=72.102, loss_att=58.600, acc=0.715, loss=62.681, grad_norm=4.518, loss_scale=1.000, learning_rate=8.479e-05, step_time=0.378 [2024-12-07 11:07:19,891] [INFO] [logging.py:129:log_dist] [Rank 0] step=334000, skipped=0, lr=[np.float64(8.476796109805209e-05)], mom=[[0.9, 0.98]] [2024-12-07 11:07:19,891] [INFO] [timer.py:264:stop] epoch=0/micro_step=79000/global_step=79000, RunningAvgSamplesPerSec=43.818918322307375, CurrSamplesPerSec=44.45125206359328, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 11:07:19,893 (deepspeed_trainer:228) INFO: 23epoch:train:3901-4000batch: iter_time=1.064e-04, loss_ctc=69.104, loss_att=58.611, acc=0.698, loss=61.759, grad_norm=4.628, loss_scale=1.000, learning_rate=8.477e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 11:07:57,165 (deepspeed_trainer:228) INFO: 23epoch:train:4001-4100batch: iter_time=1.102e-04, loss_ctc=74.884, loss_att=59.097, acc=0.696, loss=63.842, grad_norm=5.087, loss_scale=1.000, learning_rate=8.476e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 11:08:34,519 (deepspeed_trainer:228) INFO: 23epoch:train:4101-4200batch: iter_time=1.099e-04, loss_ctc=75.099, loss_att=56.330, acc=0.713, loss=61.972, grad_norm=4.630, loss_scale=1.000, learning_rate=8.475e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 11:09:12,752 (deepspeed_trainer:228) INFO: 23epoch:train:4201-4300batch: iter_time=1.101e-04, loss_ctc=75.136, loss_att=58.443, acc=0.705, loss=63.451, grad_norm=5.021, loss_scale=1.000, learning_rate=8.474e-05, step_time=0.382 [cnode7-012:0/16] 2024-12-07 11:09:50,602 (deepspeed_trainer:228) INFO: 23epoch:train:4301-4400batch: iter_time=1.116e-04, loss_ctc=72.794, loss_att=57.026, acc=0.712, loss=61.775, grad_norm=4.721, loss_scale=1.000, learning_rate=8.472e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 11:10:28,043 (deepspeed_trainer:228) INFO: 23epoch:train:4401-4500batch: iter_time=1.102e-04, loss_ctc=70.994, loss_att=54.273, acc=0.705, loss=59.314, grad_norm=4.552, loss_scale=1.000, learning_rate=8.471e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 11:11:05,504 (deepspeed_trainer:228) INFO: 23epoch:train:4501-4600batch: iter_time=1.110e-04, loss_ctc=86.205, loss_att=60.872, acc=0.701, loss=68.464, grad_norm=5.567, loss_scale=1.000, learning_rate=8.470e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 11:11:42,797 (deepspeed_trainer:228) INFO: 23epoch:train:4601-4700batch: iter_time=1.081e-04, loss_ctc=71.589, loss_att=54.953, acc=0.721, loss=59.928, grad_norm=5.051, loss_scale=1.000, learning_rate=8.469e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 11:12:20,185 (deepspeed_trainer:228) INFO: 23epoch:train:4701-4800batch: iter_time=1.150e-04, loss_ctc=66.590, loss_att=47.473, acc=0.718, loss=53.200, grad_norm=4.677, loss_scale=1.000, learning_rate=8.467e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 11:12:57,812 (deepspeed_trainer:228) INFO: 23epoch:train:4801-4900batch: iter_time=1.096e-04, loss_ctc=75.927, loss_att=61.976, acc=0.703, loss=66.140, grad_norm=4.605, loss_scale=1.000, learning_rate=8.466e-05, step_time=0.376 [2024-12-07 11:13:35,631] [INFO] [logging.py:129:log_dist] [Rank 0] step=335000, skipped=0, lr=[np.float64(8.464134757275656e-05)], mom=[[0.9, 0.98]] [2024-12-07 11:13:35,632] [INFO] [timer.py:264:stop] epoch=0/micro_step=80000/global_step=80000, RunningAvgSamplesPerSec=43.81727185987014, CurrSamplesPerSec=43.815632664812405, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 11:13:35,633 (deepspeed_trainer:228) INFO: 23epoch:train:4901-5000batch: iter_time=1.131e-04, loss_ctc=72.954, loss_att=58.485, acc=0.712, loss=62.836, grad_norm=4.177, loss_scale=1.000, learning_rate=8.465e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 11:14:13,305 (deepspeed_trainer:228) INFO: 23epoch:train:5001-5100batch: iter_time=1.172e-04, loss_ctc=79.414, loss_att=54.515, acc=0.716, loss=61.998, grad_norm=5.473, loss_scale=1.000, learning_rate=8.463e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 11:14:51,332 (deepspeed_trainer:228) INFO: 23epoch:train:5101-5200batch: iter_time=1.092e-04, loss_ctc=79.232, loss_att=60.457, acc=0.725, loss=66.100, grad_norm=4.373, loss_scale=1.000, learning_rate=8.462e-05, step_time=0.380 [cnode7-012:0/16] 2024-12-07 11:15:28,677 (deepspeed_trainer:228) INFO: 23epoch:train:5201-5300batch: iter_time=1.085e-04, loss_ctc=74.377, loss_att=55.143, acc=0.715, loss=60.898, grad_norm=4.848, loss_scale=1.000, learning_rate=8.461e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 11:16:06,210 (deepspeed_trainer:228) INFO: 23epoch:train:5301-5400batch: iter_time=1.085e-04, loss_ctc=72.265, loss_att=55.897, acc=0.713, loss=60.792, grad_norm=4.744, loss_scale=1.000, learning_rate=8.460e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 11:16:43,495 (deepspeed_trainer:228) INFO: 23epoch:train:5401-5500batch: iter_time=1.101e-04, loss_ctc=73.487, loss_att=57.742, acc=0.712, loss=62.480, grad_norm=5.632, loss_scale=1.000, learning_rate=8.458e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 11:17:21,118 (deepspeed_trainer:228) INFO: 23epoch:train:5501-5600batch: iter_time=1.109e-04, loss_ctc=76.622, loss_att=67.274, acc=0.690, loss=70.076, grad_norm=4.738, loss_scale=1.000, learning_rate=8.457e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 11:17:35,095 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-07 11:18:01,373 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 11:18:17,440 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 11:18:17,440 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 11:18:17,443 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 11:18:42,476] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:18:43,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:18:39,728] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:18:44,276] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:18:40,390] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:18:40,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:18:40,989] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:18:41,219] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:18:46,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:18:42,410] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:18:42,834] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:18:42,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:18:47,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:18:47,985] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:18:48,082] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:18:49,146] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:31,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:31,781] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:27,691] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:32,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:28,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:29,354] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:29,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:30,350] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:30,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:35,640] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:31,773] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:36,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:37,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:37,371] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:38,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:19:34,303] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:14,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:19,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:20,837] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:17,350] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:17,714] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:17,765] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:22,201] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:18,695] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:19,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:20,953] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:26,517] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:26,707] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:27,412] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:27,569] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:28,687] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:20:24,541] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:02,331] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:06,562] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:08,728] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:04,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:05,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:09,749] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:06,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:06,808] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:07,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:08,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:15,476] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:16,406] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:16,460] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:16,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:17,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:21:14,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 11:22:13,220 (deepspeed_trainer:228) INFO: 23epoch:train:5601-5700batch: iter_time=2.520, loss_ctc=64.784, loss_att=51.671, acc=0.711, loss=55.605, grad_norm=4.182, loss_scale=1.000, learning_rate=8.456e-05, step_time=0.401 [cnode7-012:0/16] 2024-12-07 11:22:50,815 (deepspeed_trainer:228) INFO: 23epoch:train:5701-5800batch: iter_time=1.052e-04, loss_ctc=75.960, loss_att=64.447, acc=0.696, loss=67.868, grad_norm=4.997, loss_scale=1.000, learning_rate=8.455e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 11:23:28,111 (deepspeed_trainer:228) INFO: 23epoch:train:5801-5900batch: iter_time=1.068e-04, loss_ctc=63.440, loss_att=50.757, acc=0.704, loss=54.548, grad_norm=4.288, loss_scale=1.000, learning_rate=8.453e-05, step_time=0.373 [2024-12-07 11:24:05,317] [INFO] [logging.py:129:log_dist] [Rank 0] step=336000, skipped=0, lr=[np.float64(8.45152997061302e-05)], mom=[[0.9, 0.98]] [2024-12-07 11:24:05,318] [INFO] [timer.py:264:stop] epoch=0/micro_step=81000/global_step=81000, RunningAvgSamplesPerSec=43.81271276683261, CurrSamplesPerSec=44.946590448400414, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 11:24:05,320 (deepspeed_trainer:228) INFO: 23epoch:train:5901-6000batch: iter_time=1.097e-04, loss_ctc=77.631, loss_att=58.191, acc=0.702, loss=64.021, grad_norm=5.318, loss_scale=1.000, learning_rate=8.452e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 11:24:42,137 (deepspeed_trainer:228) INFO: 23epoch:train:6001-6100batch: iter_time=1.116e-04, loss_ctc=73.258, loss_att=54.487, acc=0.706, loss=60.120, grad_norm=4.491, loss_scale=1.000, learning_rate=8.451e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 11:25:19,209 (deepspeed_trainer:228) INFO: 23epoch:train:6101-6200batch: iter_time=1.101e-04, loss_ctc=75.623, loss_att=58.548, acc=0.699, loss=63.635, grad_norm=4.503, loss_scale=1.000, learning_rate=8.450e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 11:25:56,017 (deepspeed_trainer:228) INFO: 23epoch:train:6201-6300batch: iter_time=1.086e-04, loss_ctc=70.570, loss_att=56.146, acc=0.704, loss=60.504, grad_norm=4.751, loss_scale=1.000, learning_rate=8.448e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 11:26:32,629 (deepspeed_trainer:228) INFO: 23epoch:train:6301-6400batch: iter_time=1.135e-04, loss_ctc=68.635, loss_att=50.973, acc=0.707, loss=56.281, grad_norm=4.281, loss_scale=1.000, learning_rate=8.447e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 11:27:09,637 (deepspeed_trainer:228) INFO: 23epoch:train:6401-6500batch: iter_time=1.081e-04, loss_ctc=94.007, loss_att=67.260, acc=0.692, loss=75.271, grad_norm=6.318, loss_scale=1.000, learning_rate=8.446e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 11:27:46,295 (deepspeed_trainer:228) INFO: 23epoch:train:6501-6600batch: iter_time=1.090e-04, loss_ctc=65.108, loss_att=48.099, acc=0.719, loss=53.184, grad_norm=4.221, loss_scale=1.000, learning_rate=8.445e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 11:28:23,048 (deepspeed_trainer:228) INFO: 23epoch:train:6601-6700batch: iter_time=1.088e-04, loss_ctc=68.136, loss_att=50.383, acc=0.714, loss=55.709, grad_norm=4.436, loss_scale=1.000, learning_rate=8.443e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 11:28:59,932 (deepspeed_trainer:228) INFO: 23epoch:train:6701-6800batch: iter_time=1.096e-04, loss_ctc=76.200, loss_att=64.317, acc=0.695, loss=67.844, grad_norm=5.291, loss_scale=1.000, learning_rate=8.442e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 11:29:36,690 (deepspeed_trainer:228) INFO: 23epoch:train:6801-6900batch: iter_time=1.088e-04, loss_ctc=77.335, loss_att=56.478, acc=0.707, loss=62.719, grad_norm=5.565, loss_scale=1.000, learning_rate=8.441e-05, step_time=0.367 [2024-12-07 11:30:13,656] [INFO] [logging.py:129:log_dist] [Rank 0] step=337000, skipped=0, lr=[np.float64(8.438981329878385e-05)], mom=[[0.9, 0.98]] [2024-12-07 11:30:13,656] [INFO] [timer.py:264:stop] epoch=0/micro_step=82000/global_step=82000, RunningAvgSamplesPerSec=43.820905646479495, CurrSamplesPerSec=44.31221488255472, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 11:30:13,658 (deepspeed_trainer:228) INFO: 23epoch:train:6901-7000batch: iter_time=1.086e-04, loss_ctc=77.478, loss_att=55.047, acc=0.718, loss=61.751, grad_norm=4.867, loss_scale=1.000, learning_rate=8.440e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 11:30:50,708 (deepspeed_trainer:228) INFO: 23epoch:train:7001-7100batch: iter_time=1.143e-04, loss_ctc=73.176, loss_att=56.325, acc=0.719, loss=61.394, grad_norm=4.915, loss_scale=1.000, learning_rate=8.438e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 11:31:28,350 (deepspeed_trainer:228) INFO: 23epoch:train:7101-7200batch: iter_time=1.142e-04, loss_ctc=75.999, loss_att=58.546, acc=0.700, loss=63.769, grad_norm=4.875, loss_scale=1.000, learning_rate=8.437e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 11:32:05,228 (deepspeed_trainer:228) INFO: 23epoch:train:7201-7300batch: iter_time=1.104e-04, loss_ctc=75.796, loss_att=55.895, acc=0.717, loss=61.831, grad_norm=5.076, loss_scale=1.000, learning_rate=8.436e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 11:32:42,657 (deepspeed_trainer:228) INFO: 23epoch:train:7301-7400batch: iter_time=1.146e-04, loss_ctc=74.326, loss_att=62.071, acc=0.701, loss=65.772, grad_norm=6.101, loss_scale=1.000, learning_rate=8.435e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 11:33:19,279 (deepspeed_trainer:228) INFO: 23epoch:train:7401-7500batch: iter_time=1.120e-04, loss_ctc=69.572, loss_att=55.701, acc=0.697, loss=59.873, grad_norm=4.568, loss_scale=1.000, learning_rate=8.433e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 11:33:23,590 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-07 11:33:50,524 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 11:34:07,605 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 11:34:07,606 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 11:34:07,608 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 11:34:31,164] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:34:31,467] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:34:28,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:34:28,829] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:34:29,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:34:29,606] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:34:29,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:34:34,573] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:34:30,429] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:34:30,469] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:34:36,112] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:34:36,219] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:34:36,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:34:36,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:34:37,218] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:34:33,288] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:19,412] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:16,808] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:18,112] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:18,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:18,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:18,806] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:23,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:19,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:19,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:24,272] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:25,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:25,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:26,359] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:26,837] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:27,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:35:23,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:06,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:04,468] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:05,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:06,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:06,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:07,558] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:07,895] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:08,133] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:12,856] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:13,087] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:14,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:15,167] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:16,318] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:16,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:12,722] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:19,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:54,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:52,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:54,047] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:55,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:55,881] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:56,263] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:56,304] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:37:03,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:37:03,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:36:59,379] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:37:00,917] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:37:05,333] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:37:06,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:37:06,220] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:37:06,451] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:37:09,948] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 11:38:14,412 (deepspeed_trainer:228) INFO: 23epoch:train:7501-7600batch: iter_time=2.575, loss_ctc=67.359, loss_att=55.210, acc=0.713, loss=58.847, grad_norm=4.307, loss_scale=1.000, learning_rate=8.432e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 11:38:49,337 (deepspeed_trainer:228) INFO: 23epoch:train:7601-7700batch: iter_time=1.126e-04, loss_ctc=70.495, loss_att=58.197, acc=0.702, loss=61.902, grad_norm=4.548, loss_scale=1.000, learning_rate=8.431e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 11:39:26,385 (deepspeed_trainer:228) INFO: 23epoch:train:7701-7800batch: iter_time=1.075e-04, loss_ctc=67.116, loss_att=54.563, acc=0.711, loss=58.336, grad_norm=4.647, loss_scale=1.000, learning_rate=8.430e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 11:40:03,259 (deepspeed_trainer:228) INFO: 23epoch:train:7801-7900batch: iter_time=1.105e-04, loss_ctc=76.399, loss_att=57.135, acc=0.703, loss=62.893, grad_norm=4.899, loss_scale=1.000, learning_rate=8.428e-05, step_time=0.368 [2024-12-07 11:40:40,219] [INFO] [logging.py:129:log_dist] [Rank 0] step=338000, skipped=0, lr=[np.float64(8.426488419484528e-05)], mom=[[0.9, 0.98]] [2024-12-07 11:40:40,220] [INFO] [timer.py:264:stop] epoch=0/micro_step=83000/global_step=83000, RunningAvgSamplesPerSec=43.82851903372652, CurrSamplesPerSec=42.240646626181466, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 11:40:40,222 (deepspeed_trainer:228) INFO: 23epoch:train:7901-8000batch: iter_time=1.106e-04, loss_ctc=75.649, loss_att=57.533, acc=0.708, loss=62.954, grad_norm=4.869, loss_scale=1.000, learning_rate=8.427e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 11:41:17,385 (deepspeed_trainer:228) INFO: 23epoch:train:8001-8100batch: iter_time=1.112e-04, loss_ctc=71.385, loss_att=58.273, acc=0.706, loss=62.202, grad_norm=4.727, loss_scale=1.000, learning_rate=8.426e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 11:41:54,424 (deepspeed_trainer:228) INFO: 23epoch:train:8101-8200batch: iter_time=1.078e-04, loss_ctc=71.159, loss_att=53.926, acc=0.712, loss=59.079, grad_norm=4.605, loss_scale=1.000, learning_rate=8.425e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 11:42:31,293 (deepspeed_trainer:228) INFO: 23epoch:train:8201-8300batch: iter_time=1.080e-04, loss_ctc=79.540, loss_att=54.028, acc=0.710, loss=61.675, grad_norm=4.500, loss_scale=1.000, learning_rate=8.423e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 11:43:08,564 (deepspeed_trainer:228) INFO: 23epoch:train:8301-8400batch: iter_time=1.105e-04, loss_ctc=84.341, loss_att=67.421, acc=0.707, loss=72.485, grad_norm=6.430, loss_scale=1.000, learning_rate=8.422e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 11:43:45,390 (deepspeed_trainer:228) INFO: 23epoch:train:8401-8500batch: iter_time=1.100e-04, loss_ctc=63.958, loss_att=47.085, acc=0.721, loss=52.163, grad_norm=4.276, loss_scale=1.000, learning_rate=8.421e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 11:44:22,363 (deepspeed_trainer:228) INFO: 23epoch:train:8501-8600batch: iter_time=1.113e-04, loss_ctc=69.069, loss_att=52.462, acc=0.711, loss=57.443, grad_norm=4.635, loss_scale=1.000, learning_rate=8.420e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 11:44:59,491 (deepspeed_trainer:228) INFO: 23epoch:train:8601-8700batch: iter_time=1.124e-04, loss_ctc=73.127, loss_att=61.297, acc=0.705, loss=64.840, grad_norm=4.550, loss_scale=1.000, learning_rate=8.418e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 11:45:37,005 (deepspeed_trainer:228) INFO: 23epoch:train:8701-8800batch: iter_time=1.079e-04, loss_ctc=79.564, loss_att=58.572, acc=0.716, loss=64.917, grad_norm=5.425, loss_scale=1.000, learning_rate=8.417e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 11:46:14,213 (deepspeed_trainer:228) INFO: 23epoch:train:8801-8900batch: iter_time=1.133e-04, loss_ctc=77.319, loss_att=58.916, acc=0.720, loss=64.440, grad_norm=4.827, loss_scale=1.000, learning_rate=8.416e-05, step_time=0.371 [2024-12-07 11:46:51,147] [INFO] [logging.py:129:log_dist] [Rank 0] step=339000, skipped=0, lr=[np.float64(8.414050828138111e-05)], mom=[[0.9, 0.98]] [2024-12-07 11:46:51,150] [INFO] [timer.py:264:stop] epoch=0/micro_step=84000/global_step=84000, RunningAvgSamplesPerSec=43.83349053505295, CurrSamplesPerSec=46.47675462568341, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 11:46:51,152 (deepspeed_trainer:228) INFO: 23epoch:train:8901-9000batch: iter_time=1.093e-04, loss_ctc=72.097, loss_att=53.551, acc=0.727, loss=59.124, grad_norm=4.611, loss_scale=1.000, learning_rate=8.415e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 11:47:28,311 (deepspeed_trainer:228) INFO: 23epoch:train:9001-9100batch: iter_time=1.081e-04, loss_ctc=81.399, loss_att=61.833, acc=0.703, loss=67.695, grad_norm=5.397, loss_scale=1.000, learning_rate=8.413e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 11:48:05,144 (deepspeed_trainer:228) INFO: 23epoch:train:9101-9200batch: iter_time=1.108e-04, loss_ctc=67.464, loss_att=51.504, acc=0.723, loss=56.294, grad_norm=5.015, loss_scale=1.000, learning_rate=8.412e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 11:48:42,428 (deepspeed_trainer:228) INFO: 23epoch:train:9201-9300batch: iter_time=1.074e-04, loss_ctc=75.562, loss_att=66.086, acc=0.698, loss=68.916, grad_norm=4.814, loss_scale=1.000, learning_rate=8.411e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 11:49:14,851 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-07 11:49:43,085 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 11:50:00,920 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 11:50:00,920 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 11:50:00,922 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 11:50:21,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:50:25,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:50:25,652] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:50:25,916] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:50:22,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:50:22,620] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:50:22,986] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:50:23,712] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:50:24,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:50:24,570] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:50:24,617] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:50:28,453] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:50:29,632] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:50:29,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:50:30,164] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:50:30,689] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:09,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:13,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:14,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:11,548] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:12,028] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:12,246] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:12,383] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:12,821] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:13,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:13,936] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:17,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:18,351] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:19,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:19,935] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:19,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:20,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:57,519] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:01,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:02,075] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:51:59,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:00,401] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:01,387] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:01,656] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:01,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:02,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:02,256] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:06,767] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:08,374] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:08,606] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:08,926] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:09,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:09,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:45,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:49,569] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:50,392] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:47,701] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:47,915] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:49,020] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:49,475] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:49,728] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:49,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:51,087] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:57,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:59,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:59,475] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:52:59,896] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:53:00,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 11:53:03,363] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 11:53:40,028 (deepspeed_trainer:228) INFO: 23epoch:train:9301-9400batch: iter_time=2.551, loss_ctc=68.322, loss_att=52.513, acc=0.705, loss=57.273, grad_norm=4.951, loss_scale=1.000, learning_rate=8.410e-05, step_time=0.409 [cnode7-012:0/16] 2024-12-07 11:54:18,162 (deepspeed_trainer:228) INFO: 23epoch:train:9401-9500batch: iter_time=1.192e-04, loss_ctc=74.673, loss_att=62.016, acc=0.708, loss=65.838, grad_norm=5.671, loss_scale=1.000, learning_rate=8.408e-05, step_time=0.381 [cnode7-012:0/16] 2024-12-07 11:54:55,796 (deepspeed_trainer:228) INFO: 23epoch:train:9501-9600batch: iter_time=1.230e-04, loss_ctc=62.216, loss_att=53.216, acc=0.698, loss=55.935, grad_norm=4.519, loss_scale=1.000, learning_rate=8.407e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 11:55:34,028 (deepspeed_trainer:228) INFO: 23epoch:train:9601-9700batch: iter_time=1.142e-04, loss_ctc=72.959, loss_att=59.074, acc=0.700, loss=63.231, grad_norm=4.676, loss_scale=1.000, learning_rate=8.406e-05, step_time=0.382 [cnode7-012:0/16] 2024-12-07 11:56:12,294 (deepspeed_trainer:228) INFO: 23epoch:train:9701-9800batch: iter_time=1.144e-04, loss_ctc=75.709, loss_att=54.392, acc=0.707, loss=60.782, grad_norm=5.343, loss_scale=1.000, learning_rate=8.405e-05, step_time=0.382 [cnode7-012:0/16] 2024-12-07 11:56:50,623 (deepspeed_trainer:228) INFO: 23epoch:train:9801-9900batch: iter_time=1.215e-04, loss_ctc=75.781, loss_att=58.768, acc=0.705, loss=63.876, grad_norm=4.970, loss_scale=1.000, learning_rate=8.404e-05, step_time=0.383 [2024-12-07 11:57:28,403] [INFO] [logging.py:129:log_dist] [Rank 0] step=340000, skipped=0, lr=[np.float64(8.401668148782807e-05)], mom=[[0.9, 0.98]] [2024-12-07 11:57:28,404] [INFO] [timer.py:264:stop] epoch=0/micro_step=85000/global_step=85000, RunningAvgSamplesPerSec=43.82492320145063, CurrSamplesPerSec=43.12113101336178, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 11:57:28,405 (deepspeed_trainer:228) INFO: 23epoch:train:9901-10000batch: iter_time=1.185e-04, loss_ctc=70.636, loss_att=55.379, acc=0.706, loss=59.973, grad_norm=4.514, loss_scale=1.000, learning_rate=8.402e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 11:58:06,303 (deepspeed_trainer:228) INFO: 23epoch:train:10001-10100batch: iter_time=1.251e-04, loss_ctc=69.286, loss_att=53.061, acc=0.708, loss=57.919, grad_norm=4.467, loss_scale=1.000, learning_rate=8.401e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 11:58:43,897 (deepspeed_trainer:228) INFO: 23epoch:train:10101-10200batch: iter_time=1.133e-04, loss_ctc=82.055, loss_att=56.660, acc=0.700, loss=64.261, grad_norm=5.062, loss_scale=1.000, learning_rate=8.400e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 11:59:21,041 (deepspeed_trainer:228) INFO: 23epoch:train:10201-10300batch: iter_time=1.075e-04, loss_ctc=74.469, loss_att=57.758, acc=0.706, loss=62.775, grad_norm=5.202, loss_scale=1.000, learning_rate=8.399e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 11:59:58,702 (deepspeed_trainer:228) INFO: 23epoch:train:10301-10400batch: iter_time=1.109e-04, loss_ctc=70.998, loss_att=52.080, acc=0.713, loss=57.763, grad_norm=4.326, loss_scale=1.000, learning_rate=8.397e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 12:00:35,882 (deepspeed_trainer:228) INFO: 23epoch:train:10401-10500batch: iter_time=1.125e-04, loss_ctc=68.737, loss_att=56.710, acc=0.704, loss=60.331, grad_norm=4.448, loss_scale=1.000, learning_rate=8.396e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 12:01:13,173 (deepspeed_trainer:228) INFO: 23epoch:train:10501-10600batch: iter_time=1.093e-04, loss_ctc=73.258, loss_att=58.967, acc=0.708, loss=63.264, grad_norm=4.745, loss_scale=1.000, learning_rate=8.395e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 12:01:50,455 (deepspeed_trainer:228) INFO: 23epoch:train:10601-10700batch: iter_time=1.099e-04, loss_ctc=78.870, loss_att=55.238, acc=0.711, loss=62.318, grad_norm=5.392, loss_scale=1.000, learning_rate=8.394e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 12:02:27,869 (deepspeed_trainer:228) INFO: 23epoch:train:10701-10800batch: iter_time=1.128e-04, loss_ctc=77.013, loss_att=59.991, acc=0.721, loss=65.093, grad_norm=5.009, loss_scale=1.000, learning_rate=8.392e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 12:03:04,969 (deepspeed_trainer:228) INFO: 23epoch:train:10801-10900batch: iter_time=1.091e-04, loss_ctc=71.283, loss_att=53.741, acc=0.715, loss=58.986, grad_norm=4.168, loss_scale=1.000, learning_rate=8.391e-05, step_time=0.370 [2024-12-07 12:03:42,378] [INFO] [logging.py:129:log_dist] [Rank 0] step=341000, skipped=0, lr=[np.float64(8.389339978543348e-05)], mom=[[0.9, 0.98]] [2024-12-07 12:03:42,380] [INFO] [timer.py:264:stop] epoch=0/micro_step=86000/global_step=86000, RunningAvgSamplesPerSec=43.82572695210093, CurrSamplesPerSec=43.54135626062215, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 12:03:42,381 (deepspeed_trainer:228) INFO: 23epoch:train:10901-11000batch: iter_time=1.087e-04, loss_ctc=80.346, loss_att=59.333, acc=0.708, loss=65.629, grad_norm=4.684, loss_scale=1.000, learning_rate=8.390e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 12:04:19,566 (deepspeed_trainer:228) INFO: 23epoch:train:11001-11100batch: iter_time=1.160e-04, loss_ctc=62.886, loss_att=50.166, acc=0.715, loss=53.978, grad_norm=4.733, loss_scale=1.000, learning_rate=8.389e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 12:04:57,017 (deepspeed_trainer:228) INFO: 23epoch:train:11101-11200batch: iter_time=1.070e-04, loss_ctc=82.043, loss_att=68.037, acc=0.696, loss=72.260, grad_norm=5.203, loss_scale=1.000, learning_rate=8.387e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 12:05:20,377 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-07 12:05:47,780 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 12:06:04,331 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 12:06:04,331 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 12:06:04,334 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 12:06:29,536] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:06:29,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:06:26,386] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:06:30,008] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:06:26,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:06:31,141] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:06:27,564] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:06:27,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:06:27,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:06:27,843] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:06:31,469] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:06:28,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:06:28,852] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:06:33,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:06:33,855] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:06:33,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:13,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:14,745] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:15,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:15,686] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:19,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:19,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:19,481] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:19,827] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:16,257] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:17,118] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:17,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:17,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:21,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:23,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:24,032] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:07:24,069] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:01,343] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:02,571] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:03,282] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:03,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:07,845] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:04,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:07,979] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:04,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:08,127] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:04,762] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:08,806] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:05,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:10,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:12,634] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:12,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:13,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:48,505] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:49,823] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:50,562] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:51,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:52,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:52,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:52,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:56,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:53,224] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:56,983] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:57,230] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:58,624] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:08:59,958] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:09:02,372] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:09:03,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:09:05,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 12:09:50,049 (deepspeed_trainer:228) INFO: 23epoch:train:11201-11300batch: iter_time=2.529, loss_ctc=65.749, loss_att=52.434, acc=0.700, loss=56.466, grad_norm=4.656, loss_scale=1.000, learning_rate=8.386e-05, step_time=0.401 [cnode7-012:0/16] 2024-12-07 12:10:27,131 (deepspeed_trainer:228) INFO: 23epoch:train:11301-11400batch: iter_time=1.058e-04, loss_ctc=70.519, loss_att=56.088, acc=0.720, loss=60.414, grad_norm=4.403, loss_scale=1.000, learning_rate=8.385e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 12:11:04,031 (deepspeed_trainer:228) INFO: 23epoch:train:11401-11500batch: iter_time=1.060e-04, loss_ctc=67.480, loss_att=57.220, acc=0.695, loss=60.309, grad_norm=4.327, loss_scale=1.000, learning_rate=8.384e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 12:11:40,749 (deepspeed_trainer:228) INFO: 23epoch:train:11501-11600batch: iter_time=1.077e-04, loss_ctc=73.471, loss_att=55.429, acc=0.701, loss=60.842, grad_norm=5.271, loss_scale=1.000, learning_rate=8.383e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 12:12:17,761 (deepspeed_trainer:228) INFO: 23epoch:train:11601-11700batch: iter_time=1.067e-04, loss_ctc=75.052, loss_att=55.206, acc=0.713, loss=61.165, grad_norm=4.705, loss_scale=1.000, learning_rate=8.381e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 12:12:55,108 (deepspeed_trainer:228) INFO: 23epoch:train:11701-11800batch: iter_time=1.081e-04, loss_ctc=73.562, loss_att=57.006, acc=0.703, loss=61.926, grad_norm=4.926, loss_scale=1.000, learning_rate=8.380e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 12:13:32,538 (deepspeed_trainer:228) INFO: 23epoch:train:11801-11900batch: iter_time=1.086e-04, loss_ctc=71.701, loss_att=54.697, acc=0.708, loss=59.809, grad_norm=4.987, loss_scale=1.000, learning_rate=8.379e-05, step_time=0.374 [2024-12-07 12:14:09,537] [INFO] [logging.py:129:log_dist] [Rank 0] step=342000, skipped=0, lr=[np.float64(8.377065918670467e-05)], mom=[[0.9, 0.98]] [2024-12-07 12:14:09,538] [INFO] [timer.py:264:stop] epoch=0/micro_step=87000/global_step=87000, RunningAvgSamplesPerSec=43.82580989056839, CurrSamplesPerSec=40.57597959976254, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 12:14:09,539 (deepspeed_trainer:228) INFO: 23epoch:train:11901-12000batch: iter_time=1.080e-04, loss_ctc=69.639, loss_att=52.489, acc=0.707, loss=57.597, grad_norm=4.436, loss_scale=1.000, learning_rate=8.378e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 12:14:46,983 (deepspeed_trainer:228) INFO: 23epoch:train:12001-12100batch: iter_time=1.075e-04, loss_ctc=84.005, loss_att=58.998, acc=0.696, loss=66.518, grad_norm=5.870, loss_scale=1.000, learning_rate=8.376e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 12:15:24,003 (deepspeed_trainer:228) INFO: 23epoch:train:12101-12200batch: iter_time=1.062e-04, loss_ctc=69.865, loss_att=54.302, acc=0.718, loss=58.986, grad_norm=4.465, loss_scale=1.000, learning_rate=8.375e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 12:16:00,910 (deepspeed_trainer:228) INFO: 23epoch:train:12201-12300batch: iter_time=1.082e-04, loss_ctc=65.738, loss_att=47.645, acc=0.715, loss=53.075, grad_norm=4.255, loss_scale=1.000, learning_rate=8.374e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 12:16:38,066 (deepspeed_trainer:228) INFO: 23epoch:train:12301-12400batch: iter_time=1.082e-04, loss_ctc=76.127, loss_att=62.516, acc=0.697, loss=66.580, grad_norm=4.698, loss_scale=1.000, learning_rate=8.373e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 12:17:14,932 (deepspeed_trainer:228) INFO: 23epoch:train:12401-12500batch: iter_time=1.075e-04, loss_ctc=72.809, loss_att=57.443, acc=0.708, loss=62.042, grad_norm=4.886, loss_scale=1.000, learning_rate=8.372e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 12:17:52,466 (deepspeed_trainer:228) INFO: 23epoch:train:12501-12600batch: iter_time=1.078e-04, loss_ctc=79.116, loss_att=54.417, acc=0.714, loss=61.816, grad_norm=5.743, loss_scale=1.000, learning_rate=8.370e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 12:18:30,054 (deepspeed_trainer:228) INFO: 23epoch:train:12601-12700batch: iter_time=1.071e-04, loss_ctc=78.331, loss_att=59.041, acc=0.725, loss=64.791, grad_norm=5.088, loss_scale=1.000, learning_rate=8.369e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 12:19:07,468 (deepspeed_trainer:228) INFO: 23epoch:train:12701-12800batch: iter_time=1.074e-04, loss_ctc=73.510, loss_att=55.144, acc=0.713, loss=60.644, grad_norm=4.622, loss_scale=1.000, learning_rate=8.368e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 12:19:45,215 (deepspeed_trainer:228) INFO: 23epoch:train:12801-12900batch: iter_time=1.087e-04, loss_ctc=71.734, loss_att=55.403, acc=0.709, loss=60.321, grad_norm=4.783, loss_scale=1.000, learning_rate=8.367e-05, step_time=0.377 [2024-12-07 12:20:23,037] [INFO] [logging.py:129:log_dist] [Rank 0] step=343000, skipped=0, lr=[np.float64(8.364845574486731e-05)], mom=[[0.9, 0.98]] [2024-12-07 12:20:23,037] [INFO] [timer.py:264:stop] epoch=0/micro_step=88000/global_step=88000, RunningAvgSamplesPerSec=43.82688735056313, CurrSamplesPerSec=44.174451852730435, MemAllocated=2.04GB, MaxMemAllocated=19.08GB [cnode7-012:0/16] 2024-12-07 12:20:23,039 (deepspeed_trainer:228) INFO: 23epoch:train:12901-13000batch: iter_time=1.060e-04, loss_ctc=72.198, loss_att=54.849, acc=0.716, loss=60.023, grad_norm=4.925, loss_scale=1.000, learning_rate=8.365e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 12:20:59,623 (deepspeed_trainer:228) INFO: 23epoch:train:13001-13100batch: iter_time=1.082e-04, loss_ctc=75.355, loss_att=65.706, acc=0.688, loss=68.617, grad_norm=5.092, loss_scale=1.000, learning_rate=8.364e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 12:21:13,270 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-07 12:21:39,499 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 12:21:55,061 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 12:21:55,061 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 12:21:55,064 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 12:22:21,402] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:22:21,828] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:22:19,399] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:22:20,278] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:22:23,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:22:20,789] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:22:20,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:22:20,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:22:24,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:22:21,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:22:21,411] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:22:22,358] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:22:25,414] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:22:26,177] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:22:26,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:22:26,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:09,845] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:10,488] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:08,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:09,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:09,135] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:09,652] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:09,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:10,275] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:10,337] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:10,835] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:14,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:14,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:15,266] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:15,336] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:15,702] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:16,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:57,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:58,020] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:56,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:58,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:59,087] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:59,130] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:59,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:59,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:59,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:23:59,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:03,110] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:04,510] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:04,583] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:04,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:06,135] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:06,929] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:46,067] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:46,896] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:44,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:46,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:46,789] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:47,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:47,694] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:47,812] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:48,286] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:53,263] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:51,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:54,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:55,116] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:55,167] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:55,420] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:24:57,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 12:25:50,963 (deepspeed_trainer:228) INFO: 23epoch:train:13101-13200batch: iter_time=2.515, loss_ctc=64.108, loss_att=50.691, acc=0.716, loss=54.717, grad_norm=4.905, loss_scale=1.000, learning_rate=8.363e-05, step_time=0.398 [cnode7-012:0/16] 2024-12-07 12:26:28,541 (deepspeed_trainer:228) INFO: 23epoch:train:13201-13300batch: iter_time=1.067e-04, loss_ctc=74.854, loss_att=63.117, acc=0.706, loss=66.644, grad_norm=4.303, loss_scale=1.000, learning_rate=8.362e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 12:27:05,592 (deepspeed_trainer:228) INFO: 23epoch:train:13301-13400batch: iter_time=1.095e-04, loss_ctc=63.059, loss_att=50.351, acc=0.716, loss=54.136, grad_norm=4.436, loss_scale=1.000, learning_rate=8.361e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 12:27:42,845 (deepspeed_trainer:228) INFO: 23epoch:train:13401-13500batch: iter_time=1.109e-04, loss_ctc=77.843, loss_att=60.591, acc=0.703, loss=65.760, grad_norm=5.363, loss_scale=1.000, learning_rate=8.359e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 12:28:20,237 (deepspeed_trainer:228) INFO: 23epoch:train:13501-13600batch: iter_time=1.092e-04, loss_ctc=72.448, loss_att=53.828, acc=0.714, loss=59.390, grad_norm=4.336, loss_scale=1.000, learning_rate=8.358e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 12:28:57,830 (deepspeed_trainer:228) INFO: 23epoch:train:13601-13700batch: iter_time=1.071e-04, loss_ctc=75.329, loss_att=59.804, acc=0.706, loss=64.456, grad_norm=4.768, loss_scale=1.000, learning_rate=8.357e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 12:29:35,889 (deepspeed_trainer:228) INFO: 23epoch:train:13701-13800batch: iter_time=1.079e-04, loss_ctc=69.733, loss_att=55.984, acc=0.713, loss=60.096, grad_norm=4.973, loss_scale=1.000, learning_rate=8.356e-05, step_time=0.380 [cnode7-012:0/16] 2024-12-07 12:30:13,140 (deepspeed_trainer:228) INFO: 23epoch:train:13801-13900batch: iter_time=1.116e-04, loss_ctc=68.045, loss_att=51.066, acc=0.712, loss=56.191, grad_norm=4.466, loss_scale=1.000, learning_rate=8.354e-05, step_time=0.372 [2024-12-07 12:30:50,748] [INFO] [logging.py:129:log_dist] [Rank 0] step=344000, skipped=0, lr=[np.float64(8.352678555333234e-05)], mom=[[0.9, 0.98]] [2024-12-07 12:30:50,749] [INFO] [timer.py:264:stop] epoch=0/micro_step=89000/global_step=89000, RunningAvgSamplesPerSec=43.824408087116296, CurrSamplesPerSec=44.82892459484652, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 12:30:50,750 (deepspeed_trainer:228) INFO: 23epoch:train:13901-14000batch: iter_time=1.062e-04, loss_ctc=89.672, loss_att=67.914, acc=0.700, loss=74.476, grad_norm=6.137, loss_scale=1.000, learning_rate=8.353e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 12:31:27,684 (deepspeed_trainer:228) INFO: 23epoch:train:14001-14100batch: iter_time=1.086e-04, loss_ctc=65.228, loss_att=47.897, acc=0.726, loss=53.110, grad_norm=4.240, loss_scale=1.000, learning_rate=8.352e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 12:32:04,903 (deepspeed_trainer:228) INFO: 23epoch:train:14101-14200batch: iter_time=1.106e-04, loss_ctc=68.420, loss_att=50.679, acc=0.717, loss=55.982, grad_norm=4.452, loss_scale=1.000, learning_rate=8.351e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 12:32:42,221 (deepspeed_trainer:228) INFO: 23epoch:train:14201-14300batch: iter_time=1.116e-04, loss_ctc=75.634, loss_att=63.656, acc=0.705, loss=67.282, grad_norm=4.697, loss_scale=1.000, learning_rate=8.350e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 12:33:19,327 (deepspeed_trainer:228) INFO: 23epoch:train:14301-14400batch: iter_time=1.130e-04, loss_ctc=78.251, loss_att=57.531, acc=0.716, loss=63.736, grad_norm=5.483, loss_scale=1.000, learning_rate=8.348e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 12:33:56,520 (deepspeed_trainer:228) INFO: 23epoch:train:14401-14500batch: iter_time=1.108e-04, loss_ctc=76.914, loss_att=55.368, acc=0.723, loss=61.833, grad_norm=4.659, loss_scale=1.000, learning_rate=8.347e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 12:34:33,696 (deepspeed_trainer:228) INFO: 23epoch:train:14501-14600batch: iter_time=1.122e-04, loss_ctc=72.613, loss_att=55.740, acc=0.727, loss=60.798, grad_norm=4.222, loss_scale=1.000, learning_rate=8.346e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 12:35:10,718 (deepspeed_trainer:228) INFO: 23epoch:train:14601-14700batch: iter_time=1.090e-04, loss_ctc=75.579, loss_att=58.208, acc=0.704, loss=63.464, grad_norm=4.442, loss_scale=1.000, learning_rate=8.345e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 12:35:47,832 (deepspeed_trainer:228) INFO: 23epoch:train:14701-14800batch: iter_time=1.103e-04, loss_ctc=75.456, loss_att=56.303, acc=0.721, loss=62.054, grad_norm=4.688, loss_scale=1.000, learning_rate=8.344e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 12:36:26,086 (deepspeed_trainer:228) INFO: 23epoch:train:14801-14900batch: iter_time=1.099e-04, loss_ctc=73.442, loss_att=63.553, acc=0.706, loss=66.553, grad_norm=5.479, loss_scale=1.000, learning_rate=8.342e-05, step_time=0.382 [2024-12-07 12:37:03,088] [INFO] [logging.py:129:log_dist] [Rank 0] step=345000, skipped=0, lr=[np.float64(8.34056447451715e-05)], mom=[[0.9, 0.98]] [2024-12-07 12:37:03,089] [INFO] [timer.py:264:stop] epoch=0/micro_step=90000/global_step=90000, RunningAvgSamplesPerSec=43.827168887463806, CurrSamplesPerSec=46.931369572517355, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 12:37:03,091 (deepspeed_trainer:228) INFO: 23epoch:train:14901-15000batch: iter_time=1.062e-04, loss_ctc=69.039, loss_att=57.096, acc=0.699, loss=60.674, grad_norm=4.366, loss_scale=1.000, learning_rate=8.341e-05, step_time=0.370 [2024-12-07 12:37:19,294] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:19,402] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:16,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:19,904] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:16,970] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:19,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:17,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:20,077] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:20,085] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:17,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:17,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:20,185] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:20,230] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:17,258] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:17,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:17,327] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:32,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:32,204] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:35,273] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:35,308] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:32,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:36,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:36,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:33,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:33,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:34,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:37,078] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:34,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:37,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:34,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:37,294] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:37,304] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:47,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:50,974] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:48,283] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:51,448] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:48,497] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:51,844] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:49,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:52,200] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:49,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:49,900] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:53,079] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:50,166] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:53,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:53,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:50,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:37:53,531] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:05,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:03,038] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:04,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:07,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:04,696] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:07,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:04,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:08,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:05,369] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:05,524] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:08,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:08,877] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:05,916] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:06,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:09,038] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:09,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:38:21,962] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 23 is about to be saved! [2024-12-07 12:38:21,992] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/mp_rank_00_model_states.pt [2024-12-07 12:38:21,992] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/mp_rank_00_model_states.pt... [2024-12-07 12:38:23,880] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/mp_rank_00_model_states.pt. [2024-12-07 12:38:21,046] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-07 12:38:21,046] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-07 12:38:21,047] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-07 12:38:24,040] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-07 12:38:24,041] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-07 12:38:24,043] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-07 12:38:21,051] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-07 12:38:21,051] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-07 12:38:21,052] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-07 12:38:24,045] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-07 12:38:24,045] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-07 12:38:21,053] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-07 12:38:21,054] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-07 12:38:24,046] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-07 12:38:24,046] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-07 12:38:24,046] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-07 12:38:24,656] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-07 12:38:24,656] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-07 12:38:24,656] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [2024-12-07 12:38:24,810] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-07 12:38:24,814] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-07 12:38:24,814] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [2024-12-07 12:38:21,828] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-07 12:38:21,828] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-07 12:38:21,829] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-07 12:38:21,828] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-07 12:38:21,829] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-07 12:38:21,829] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [2024-12-07 12:38:21,829] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-07 12:38:21,829] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [2024-12-07 12:38:21,829] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [2024-12-07 12:38:21,832] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-07 12:38:21,832] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-07 12:38:21,832] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [2024-12-07 12:38:21,855] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-07 12:38:21,855] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-07 12:38:21,855] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [2024-12-07 12:38:21,857] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-07 12:38:21,857] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-07 12:38:21,858] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [2024-12-07 12:38:24,850] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-07 12:38:24,850] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-07 12:38:24,851] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [2024-12-07 12:38:24,856] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-07 12:38:24,856] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-07 12:38:24,856] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [2024-12-07 12:38:24,865] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-07 12:38:24,865] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-07 12:38:24,865] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [2024-12-07 12:38:24,878] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-07 12:38:24,878] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-07 12:38:24,878] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [2024-12-07 12:38:24,886] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-07 12:38:24,887] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-07 12:38:24,887] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [2024-12-07 12:38:21,913] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-07 12:38:21,913] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-07 12:38:21,914] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [2024-12-07 12:38:21,916] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-07 12:38:21,916] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-07 12:38:21,916] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [2024-12-07 12:38:24,917] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-07 12:38:24,917] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_23/23/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-07 12:38:24,917] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 23 is ready now! [cnode7-012:0/16] 2024-12-07 12:38:24,957 (deepspeed_trainer:158) INFO: 23epoch results: [train] iter_time=0.135, loss_ctc=73.793, loss_att=57.004, acc=0.708, loss=62.040, grad_norm=4.864, loss_scale=1.000, learning_rate=8.433e-05, step_time=0.375, time=2 hours, 7 minutes and 32.83 seconds, total_count=345023, gpu_max_cached_mem_GB=27.986, [valid] loss_ctc=4.344, cer_ctc=0.127, loss_att=7.875, acc=0.776, cer=0.359, wer=1.000, loss=6.812, time=1 minute and 9.1 seconds, total_count=23, gpu_max_cached_mem_GB=27.986 [cnode7-012:0/16] 2024-12-07 12:38:27,247 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-07 12:38:54,275 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 12:39:10,456 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 12:39:10,456 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 12:39:10,458 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 12:39:26,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:39:29,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:39:27,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:39:27,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:39:28,133] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:39:28,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:39:28,587] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:39:31,796] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:39:29,440] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:39:32,801] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:39:32,940] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:39:33,183] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:39:30,396] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:39:33,842] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:39:33,878] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:39:34,517] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:13,074] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:15,132] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:15,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:18,339] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:15,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:16,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:16,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:17,570] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:21,000] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:18,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:21,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:23,103] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:23,475] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:24,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:24,645] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:40:26,512] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:00,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:04,367] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:02,535] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:02,800] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:03,435] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:03,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:04,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:07,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:05,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:09,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:06,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:11,028] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:11,293] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:12,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:13,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:16,675] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:47,174] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:52,412] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:50,057] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:50,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:51,419] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:52,424] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:54,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:57,919] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:55,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:58,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:59,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:41:56,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:42:01,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:42:02,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:42:03,572] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:42:06,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 12:43:10,811 (deepspeed_trainer:228) INFO: 24epoch:train:1-100batch: iter_time=2.394, loss_ctc=79.681, loss_att=58.454, acc=0.712, loss=64.782, grad_norm=4.698, loss_scale=1.000, learning_rate=8.340e-05, step_time=0.442 [cnode7-012:0/16] 2024-12-07 12:43:47,914 (deepspeed_trainer:228) INFO: 24epoch:train:101-200batch: iter_time=1.108e-04, loss_ctc=70.342, loss_att=51.790, acc=0.702, loss=57.392, grad_norm=4.860, loss_scale=1.000, learning_rate=8.339e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 12:44:25,346 (deepspeed_trainer:228) INFO: 24epoch:train:201-300batch: iter_time=1.077e-04, loss_ctc=76.175, loss_att=56.180, acc=0.719, loss=62.152, grad_norm=4.744, loss_scale=1.000, learning_rate=8.338e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 12:45:02,367 (deepspeed_trainer:228) INFO: 24epoch:train:301-400batch: iter_time=1.054e-04, loss_ctc=69.695, loss_att=49.521, acc=0.713, loss=55.561, grad_norm=4.759, loss_scale=1.000, learning_rate=8.336e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 12:45:39,807 (deepspeed_trainer:228) INFO: 24epoch:train:401-500batch: iter_time=1.110e-04, loss_ctc=78.601, loss_att=58.222, acc=0.717, loss=64.329, grad_norm=4.955, loss_scale=1.000, learning_rate=8.335e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 12:46:16,960 (deepspeed_trainer:228) INFO: 24epoch:train:501-600batch: iter_time=1.088e-04, loss_ctc=72.706, loss_att=55.872, acc=0.707, loss=60.941, grad_norm=4.504, loss_scale=1.000, learning_rate=8.334e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 12:46:54,302 (deepspeed_trainer:228) INFO: 24epoch:train:601-700batch: iter_time=1.135e-04, loss_ctc=73.747, loss_att=57.389, acc=0.704, loss=62.294, grad_norm=5.268, loss_scale=1.000, learning_rate=8.333e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 12:47:31,604 (deepspeed_trainer:228) INFO: 24epoch:train:701-800batch: iter_time=1.082e-04, loss_ctc=73.286, loss_att=54.788, acc=0.705, loss=60.345, grad_norm=5.462, loss_scale=1.000, learning_rate=8.332e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 12:48:09,135 (deepspeed_trainer:228) INFO: 24epoch:train:801-900batch: iter_time=1.081e-04, loss_ctc=80.190, loss_att=60.132, acc=0.712, loss=66.147, grad_norm=5.377, loss_scale=1.000, learning_rate=8.330e-05, step_time=0.375 [2024-12-07 12:48:46,626] [INFO] [logging.py:129:log_dist] [Rank 0] step=346000, skipped=0, lr=[np.float64(8.3285029492601e-05)], mom=[[0.9, 0.98]] [2024-12-07 12:48:46,627] [INFO] [timer.py:264:stop] epoch=0/micro_step=91000/global_step=91000, RunningAvgSamplesPerSec=43.81967764641911, CurrSamplesPerSec=42.93375808759447, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 12:48:46,628 (deepspeed_trainer:228) INFO: 24epoch:train:901-1000batch: iter_time=1.070e-04, loss_ctc=86.721, loss_att=73.564, acc=0.683, loss=77.508, grad_norm=5.612, loss_scale=1.000, learning_rate=8.329e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 12:49:24,008 (deepspeed_trainer:228) INFO: 24epoch:train:1001-1100batch: iter_time=1.147e-04, loss_ctc=70.078, loss_att=56.626, acc=0.707, loss=60.660, grad_norm=4.350, loss_scale=1.000, learning_rate=8.328e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 12:50:01,479 (deepspeed_trainer:228) INFO: 24epoch:train:1101-1200batch: iter_time=1.066e-04, loss_ctc=69.711, loss_att=51.656, acc=0.714, loss=57.064, grad_norm=4.783, loss_scale=1.000, learning_rate=8.327e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 12:50:39,272 (deepspeed_trainer:228) INFO: 24epoch:train:1201-1300batch: iter_time=1.079e-04, loss_ctc=79.914, loss_att=62.106, acc=0.698, loss=67.465, grad_norm=5.447, loss_scale=1.000, learning_rate=8.325e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 12:51:16,229 (deepspeed_trainer:228) INFO: 24epoch:train:1301-1400batch: iter_time=1.100e-04, loss_ctc=80.691, loss_att=50.431, acc=0.714, loss=59.505, grad_norm=5.284, loss_scale=1.000, learning_rate=8.324e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 12:51:53,832 (deepspeed_trainer:228) INFO: 24epoch:train:1401-1500batch: iter_time=1.052e-04, loss_ctc=84.627, loss_att=68.732, acc=0.696, loss=73.532, grad_norm=4.961, loss_scale=1.000, learning_rate=8.323e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 12:52:31,207 (deepspeed_trainer:228) INFO: 24epoch:train:1501-1600batch: iter_time=1.085e-04, loss_ctc=81.494, loss_att=63.171, acc=0.698, loss=68.719, grad_norm=5.488, loss_scale=1.000, learning_rate=8.322e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 12:53:08,999 (deepspeed_trainer:228) INFO: 24epoch:train:1601-1700batch: iter_time=1.065e-04, loss_ctc=83.499, loss_att=66.393, acc=0.702, loss=71.518, grad_norm=5.209, loss_scale=1.000, learning_rate=8.321e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 12:53:46,429 (deepspeed_trainer:228) INFO: 24epoch:train:1701-1800batch: iter_time=1.062e-04, loss_ctc=73.261, loss_att=49.622, acc=0.725, loss=56.698, grad_norm=4.788, loss_scale=1.000, learning_rate=8.319e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 12:54:20,108 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-07 12:54:46,812 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 12:55:04,306 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 12:55:04,307 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 12:55:04,309 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 12:55:30,496] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:55:30,534] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:55:30,573] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:55:30,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:55:28,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:55:32,037] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:55:28,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:55:29,027] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:55:29,119] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:55:29,205] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:55:29,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:55:29,309] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:55:29,311] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:55:33,282] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:55:33,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:55:33,462] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:17,801] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:21,399] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:21,557] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:18,592] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:21,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:18,836] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:22,034] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:22,272] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:19,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:19,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:19,414] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:19,446] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:19,489] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:22,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:23,174] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:56:23,770] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:04,889] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:06,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:06,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:09,961] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:09,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:10,324] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:07,302] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:07,617] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:10,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:08,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:08,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:11,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:08,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:12,101] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:12,844] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:13,263] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:53,141] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:54,623] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:54,664] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:54,893] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:55,198] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:55,526] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:58,878] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:59,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:56,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:59,526] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:57:57,029] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:58:01,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:58:01,731] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:58:02,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:58:03,517] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 12:58:04,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 12:58:41,043 (deepspeed_trainer:228) INFO: 24epoch:train:1801-1900batch: iter_time=2.567, loss_ctc=77.159, loss_att=59.888, acc=0.694, loss=65.064, grad_norm=5.013, loss_scale=1.000, learning_rate=8.318e-05, step_time=0.379 [2024-12-07 12:59:18,694] [INFO] [logging.py:129:log_dist] [Rank 0] step=347000, skipped=0, lr=[np.float64(8.316493600647368e-05)], mom=[[0.9, 0.98]] [2024-12-07 12:59:18,694] [INFO] [timer.py:264:stop] epoch=0/micro_step=92000/global_step=92000, RunningAvgSamplesPerSec=43.818554394278436, CurrSamplesPerSec=43.885230398396665, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 12:59:18,696 (deepspeed_trainer:228) INFO: 24epoch:train:1901-2000batch: iter_time=1.089e-04, loss_ctc=77.931, loss_att=58.468, acc=0.701, loss=64.327, grad_norm=4.822, loss_scale=1.000, learning_rate=8.317e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 12:59:55,949 (deepspeed_trainer:228) INFO: 24epoch:train:2001-2100batch: iter_time=1.122e-04, loss_ctc=67.654, loss_att=50.751, acc=0.707, loss=55.801, grad_norm=4.451, loss_scale=1.000, learning_rate=8.316e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 13:00:33,549 (deepspeed_trainer:228) INFO: 24epoch:train:2101-2200batch: iter_time=1.122e-04, loss_ctc=80.999, loss_att=58.153, acc=0.705, loss=65.005, grad_norm=4.713, loss_scale=1.000, learning_rate=8.315e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 13:01:11,007 (deepspeed_trainer:228) INFO: 24epoch:train:2201-2300batch: iter_time=1.220e-04, loss_ctc=66.192, loss_att=48.679, acc=0.722, loss=53.915, grad_norm=4.529, loss_scale=1.000, learning_rate=8.313e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 13:01:48,688 (deepspeed_trainer:228) INFO: 24epoch:train:2301-2400batch: iter_time=1.155e-04, loss_ctc=79.002, loss_att=59.357, acc=0.704, loss=65.250, grad_norm=4.711, loss_scale=1.000, learning_rate=8.312e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 13:02:26,055 (deepspeed_trainer:228) INFO: 24epoch:train:2401-2500batch: iter_time=1.182e-04, loss_ctc=69.606, loss_att=52.528, acc=0.707, loss=57.676, grad_norm=4.732, loss_scale=1.000, learning_rate=8.311e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 13:03:03,598 (deepspeed_trainer:228) INFO: 24epoch:train:2501-2600batch: iter_time=1.229e-04, loss_ctc=72.192, loss_att=54.883, acc=0.705, loss=60.055, grad_norm=5.109, loss_scale=1.000, learning_rate=8.310e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 13:03:40,861 (deepspeed_trainer:228) INFO: 24epoch:train:2601-2700batch: iter_time=1.139e-04, loss_ctc=76.616, loss_att=60.028, acc=0.707, loss=64.985, grad_norm=4.412, loss_scale=1.000, learning_rate=8.309e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 13:04:17,872 (deepspeed_trainer:228) INFO: 24epoch:train:2701-2800batch: iter_time=1.170e-04, loss_ctc=77.809, loss_att=56.401, acc=0.701, loss=62.837, grad_norm=5.463, loss_scale=1.000, learning_rate=8.308e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 13:04:55,058 (deepspeed_trainer:228) INFO: 24epoch:train:2801-2900batch: iter_time=1.196e-04, loss_ctc=79.714, loss_att=65.757, acc=0.689, loss=69.966, grad_norm=5.104, loss_scale=1.000, learning_rate=8.306e-05, step_time=0.372 [2024-12-07 13:05:32,237] [INFO] [logging.py:129:log_dist] [Rank 0] step=348000, skipped=0, lr=[np.float64(8.304536053577894e-05)], mom=[[0.9, 0.98]] [2024-12-07 13:05:32,237] [INFO] [timer.py:264:stop] epoch=0/micro_step=93000/global_step=93000, RunningAvgSamplesPerSec=43.820190714566124, CurrSamplesPerSec=46.899227409080986, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 13:05:32,239 (deepspeed_trainer:228) INFO: 24epoch:train:2901-3000batch: iter_time=1.128e-04, loss_ctc=68.489, loss_att=55.194, acc=0.700, loss=59.188, grad_norm=4.788, loss_scale=1.000, learning_rate=8.305e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 13:06:09,366 (deepspeed_trainer:228) INFO: 24epoch:train:3001-3100batch: iter_time=1.120e-04, loss_ctc=78.438, loss_att=59.081, acc=0.694, loss=64.905, grad_norm=5.347, loss_scale=1.000, learning_rate=8.304e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 13:06:46,616 (deepspeed_trainer:228) INFO: 24epoch:train:3101-3200batch: iter_time=1.088e-04, loss_ctc=82.596, loss_att=58.434, acc=0.701, loss=65.692, grad_norm=4.841, loss_scale=1.000, learning_rate=8.303e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 13:07:23,449 (deepspeed_trainer:228) INFO: 24epoch:train:3201-3300batch: iter_time=1.134e-04, loss_ctc=69.007, loss_att=47.068, acc=0.722, loss=53.664, grad_norm=4.565, loss_scale=1.000, learning_rate=8.302e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 13:08:01,223 (deepspeed_trainer:228) INFO: 24epoch:train:3301-3400batch: iter_time=1.223e-04, loss_ctc=86.474, loss_att=67.661, acc=0.690, loss=73.316, grad_norm=5.355, loss_scale=1.000, learning_rate=8.300e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 13:08:39,451 (deepspeed_trainer:228) INFO: 24epoch:train:3401-3500batch: iter_time=1.135e-04, loss_ctc=84.406, loss_att=64.303, acc=0.697, loss=70.355, grad_norm=5.477, loss_scale=1.000, learning_rate=8.299e-05, step_time=0.382 [cnode7-012:0/16] 2024-12-07 13:09:16,745 (deepspeed_trainer:228) INFO: 24epoch:train:3501-3600batch: iter_time=1.162e-04, loss_ctc=78.478, loss_att=60.591, acc=0.703, loss=65.966, grad_norm=4.829, loss_scale=1.000, learning_rate=8.298e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 13:09:53,754 (deepspeed_trainer:228) INFO: 24epoch:train:3601-3700batch: iter_time=1.215e-04, loss_ctc=68.959, loss_att=46.912, acc=0.718, loss=53.512, grad_norm=5.029, loss_scale=1.000, learning_rate=8.297e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 13:10:16,712 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-07 13:10:43,691 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 13:10:59,240 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 13:10:59,240 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 13:10:59,242 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 13:11:23,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:11:24,061] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:11:24,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:11:26,873] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:11:24,598] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:11:24,783] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:11:28,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:11:28,075] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:11:24,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:11:25,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:11:25,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:11:25,110] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:11:25,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:11:28,432] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:11:25,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:11:29,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:11,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:12,224] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:12,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:15,887] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:13,185] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:16,561] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:13,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:14,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:14,419] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:18,091] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:18,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:15,367] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:18,611] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:15,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:15,543] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:15,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:57,729] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:12:59,803] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:00,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:04,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:04,466] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:01,711] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:02,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:02,416] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:02,703] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:02,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:06,321] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:06,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:03,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:06,997] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:04,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:04,224] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:45,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:47,059] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:48,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:48,955] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:49,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:52,893] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:53,333] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:51,005] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:51,336] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:54,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:51,884] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:51,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:55,544] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:55,622] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:52,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:13:52,556] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 13:14:42,362 (deepspeed_trainer:228) INFO: 24epoch:train:3701-3800batch: iter_time=2.482, loss_ctc=82.324, loss_att=64.213, acc=0.695, loss=69.653, grad_norm=5.194, loss_scale=1.000, learning_rate=8.296e-05, step_time=0.403 [cnode7-012:0/16] 2024-12-07 13:15:19,918 (deepspeed_trainer:228) INFO: 24epoch:train:3801-3900batch: iter_time=1.054e-04, loss_ctc=77.874, loss_att=57.331, acc=0.710, loss=63.487, grad_norm=4.574, loss_scale=1.000, learning_rate=8.294e-05, step_time=0.375 [2024-12-07 13:15:57,196] [INFO] [logging.py:129:log_dist] [Rank 0] step=349000, skipped=0, lr=[np.float64(8.292629936715068e-05)], mom=[[0.9, 0.98]] [2024-12-07 13:15:57,199] [INFO] [timer.py:264:stop] epoch=0/micro_step=94000/global_step=94000, RunningAvgSamplesPerSec=43.81749775438242, CurrSamplesPerSec=43.82212750568459, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 13:15:57,201 (deepspeed_trainer:228) INFO: 24epoch:train:3901-4000batch: iter_time=1.055e-04, loss_ctc=67.804, loss_att=51.155, acc=0.717, loss=56.162, grad_norm=4.702, loss_scale=1.000, learning_rate=8.293e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 13:16:34,757 (deepspeed_trainer:228) INFO: 24epoch:train:4001-4100batch: iter_time=1.080e-04, loss_ctc=77.052, loss_att=55.156, acc=0.711, loss=61.759, grad_norm=5.335, loss_scale=1.000, learning_rate=8.292e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 13:17:11,966 (deepspeed_trainer:228) INFO: 24epoch:train:4101-4200batch: iter_time=1.080e-04, loss_ctc=65.484, loss_att=48.263, acc=0.726, loss=53.385, grad_norm=4.607, loss_scale=1.000, learning_rate=8.291e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 13:17:49,349 (deepspeed_trainer:228) INFO: 24epoch:train:4201-4300batch: iter_time=1.084e-04, loss_ctc=82.584, loss_att=63.014, acc=0.710, loss=68.866, grad_norm=4.753, loss_scale=1.000, learning_rate=8.290e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 13:18:26,173 (deepspeed_trainer:228) INFO: 24epoch:train:4301-4400batch: iter_time=1.084e-04, loss_ctc=68.039, loss_att=52.752, acc=0.709, loss=57.331, grad_norm=4.746, loss_scale=1.000, learning_rate=8.288e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 13:19:03,129 (deepspeed_trainer:228) INFO: 24epoch:train:4401-4500batch: iter_time=1.068e-04, loss_ctc=71.843, loss_att=56.617, acc=0.710, loss=61.188, grad_norm=4.784, loss_scale=1.000, learning_rate=8.287e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 13:19:40,180 (deepspeed_trainer:228) INFO: 24epoch:train:4501-4600batch: iter_time=1.068e-04, loss_ctc=74.282, loss_att=54.716, acc=0.721, loss=60.597, grad_norm=4.670, loss_scale=1.000, learning_rate=8.286e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 13:20:17,363 (deepspeed_trainer:228) INFO: 24epoch:train:4601-4700batch: iter_time=1.078e-04, loss_ctc=81.093, loss_att=63.539, acc=0.688, loss=68.762, grad_norm=5.871, loss_scale=1.000, learning_rate=8.285e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 13:20:54,573 (deepspeed_trainer:228) INFO: 24epoch:train:4701-4800batch: iter_time=1.076e-04, loss_ctc=74.306, loss_att=63.477, acc=0.709, loss=66.723, grad_norm=4.895, loss_scale=1.000, learning_rate=8.284e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 13:21:31,503 (deepspeed_trainer:228) INFO: 24epoch:train:4801-4900batch: iter_time=1.069e-04, loss_ctc=66.014, loss_att=50.318, acc=0.714, loss=55.054, grad_norm=4.524, loss_scale=1.000, learning_rate=8.283e-05, step_time=0.369 [2024-12-07 13:22:08,589] [INFO] [logging.py:129:log_dist] [Rank 0] step=350000, skipped=0, lr=[np.float64(8.280774882438296e-05)], mom=[[0.9, 0.98]] [2024-12-07 13:22:08,589] [INFO] [timer.py:264:stop] epoch=0/micro_step=95000/global_step=95000, RunningAvgSamplesPerSec=43.821316062084996, CurrSamplesPerSec=44.67410203898934, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 13:22:08,591 (deepspeed_trainer:228) INFO: 24epoch:train:4901-5000batch: iter_time=1.118e-04, loss_ctc=78.834, loss_att=59.172, acc=0.697, loss=65.052, grad_norm=5.440, loss_scale=1.000, learning_rate=8.281e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 13:22:45,802 (deepspeed_trainer:228) INFO: 24epoch:train:5001-5100batch: iter_time=1.088e-04, loss_ctc=83.029, loss_att=59.340, acc=0.710, loss=66.444, grad_norm=5.271, loss_scale=1.000, learning_rate=8.280e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 13:23:22,791 (deepspeed_trainer:228) INFO: 24epoch:train:5101-5200batch: iter_time=1.119e-04, loss_ctc=69.811, loss_att=53.669, acc=0.710, loss=58.541, grad_norm=4.680, loss_scale=1.000, learning_rate=8.279e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 13:24:01,468 (deepspeed_trainer:228) INFO: 24epoch:train:5201-5300batch: iter_time=1.432e-04, loss_ctc=83.339, loss_att=67.590, acc=0.699, loss=72.331, grad_norm=5.171, loss_scale=1.000, learning_rate=8.278e-05, step_time=0.386 [cnode7-012:0/16] 2024-12-07 13:24:39,090 (deepspeed_trainer:228) INFO: 24epoch:train:5301-5400batch: iter_time=1.130e-04, loss_ctc=88.778, loss_att=65.541, acc=0.702, loss=72.513, grad_norm=5.808, loss_scale=1.000, learning_rate=8.277e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 13:25:16,568 (deepspeed_trainer:228) INFO: 24epoch:train:5401-5500batch: iter_time=1.143e-04, loss_ctc=75.315, loss_att=60.118, acc=0.720, loss=64.664, grad_norm=4.504, loss_scale=1.000, learning_rate=8.275e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 13:25:53,501 (deepspeed_trainer:228) INFO: 24epoch:train:5501-5600batch: iter_time=1.105e-04, loss_ctc=68.772, loss_att=49.227, acc=0.704, loss=55.072, grad_norm=4.841, loss_scale=1.000, learning_rate=8.274e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 13:26:07,233 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-07 13:26:33,359 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 13:26:49,933 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 13:26:49,933 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 13:26:49,935 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 13:27:12,910] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:27:12,966] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:27:16,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:27:16,558] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:27:16,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:27:13,696] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:27:14,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:27:14,874] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:27:14,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:27:18,300] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:27:15,044] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:27:15,423] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:27:19,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:27:20,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:27:20,564] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:27:20,877] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:02,057] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:02,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:05,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:02,435] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:06,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:03,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:04,424] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:04,471] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:04,557] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:04,783] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:08,220] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:08,257] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:08,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:09,510] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:09,734] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:13,065] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:50,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:50,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:50,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:50,996] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:54,596] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:52,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:55,583] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:52,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:53,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:54,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:59,031] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:59,444] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:59,693] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:59,979] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:28:59,984] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:03,474] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:38,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:38,908] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:42,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:39,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:39,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:42,763] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:40,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:41,383] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:41,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:41,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:48,082] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:48,233] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:48,337] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:48,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:49,498] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:29:52,326] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 13:30:47,023 (deepspeed_trainer:228) INFO: 24epoch:train:5601-5700batch: iter_time=2.529, loss_ctc=82.901, loss_att=62.290, acc=0.707, loss=68.448, grad_norm=5.274, loss_scale=1.000, learning_rate=8.273e-05, step_time=0.406 [cnode7-012:0/16] 2024-12-07 13:31:24,329 (deepspeed_trainer:228) INFO: 24epoch:train:5701-5800batch: iter_time=1.065e-04, loss_ctc=68.743, loss_att=50.877, acc=0.712, loss=56.223, grad_norm=5.476, loss_scale=1.000, learning_rate=8.272e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 13:32:02,390 (deepspeed_trainer:228) INFO: 24epoch:train:5801-5900batch: iter_time=1.040e-04, loss_ctc=76.459, loss_att=56.216, acc=0.720, loss=62.307, grad_norm=5.031, loss_scale=1.000, learning_rate=8.271e-05, step_time=0.380 [2024-12-07 13:32:39,438] [INFO] [logging.py:129:log_dist] [Rank 0] step=351000, skipped=0, lr=[np.float64(8.268970526795326e-05)], mom=[[0.9, 0.98]] [2024-12-07 13:32:39,438] [INFO] [timer.py:264:stop] epoch=0/micro_step=96000/global_step=96000, RunningAvgSamplesPerSec=43.81752578874873, CurrSamplesPerSec=45.01989083443194, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 13:32:39,440 (deepspeed_trainer:228) INFO: 24epoch:train:5901-6000batch: iter_time=1.054e-04, loss_ctc=69.796, loss_att=48.369, acc=0.721, loss=54.816, grad_norm=5.027, loss_scale=1.000, learning_rate=8.270e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 13:33:16,389 (deepspeed_trainer:228) INFO: 24epoch:train:6001-6100batch: iter_time=1.099e-04, loss_ctc=70.253, loss_att=53.009, acc=0.722, loss=58.184, grad_norm=4.504, loss_scale=1.000, learning_rate=8.268e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 13:33:53,661 (deepspeed_trainer:228) INFO: 24epoch:train:6101-6200batch: iter_time=1.080e-04, loss_ctc=75.832, loss_att=57.169, acc=0.716, loss=62.763, grad_norm=4.609, loss_scale=1.000, learning_rate=8.267e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 13:34:30,868 (deepspeed_trainer:228) INFO: 24epoch:train:6201-6300batch: iter_time=1.078e-04, loss_ctc=72.499, loss_att=56.340, acc=0.706, loss=61.203, grad_norm=5.083, loss_scale=1.000, learning_rate=8.266e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 13:35:08,380 (deepspeed_trainer:228) INFO: 24epoch:train:6301-6400batch: iter_time=1.101e-04, loss_ctc=69.208, loss_att=53.555, acc=0.710, loss=58.267, grad_norm=5.311, loss_scale=1.000, learning_rate=8.265e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 13:35:45,968 (deepspeed_trainer:228) INFO: 24epoch:train:6401-6500batch: iter_time=1.089e-04, loss_ctc=77.434, loss_att=58.394, acc=0.719, loss=64.093, grad_norm=4.366, loss_scale=1.000, learning_rate=8.264e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 13:36:23,637 (deepspeed_trainer:228) INFO: 24epoch:train:6501-6600batch: iter_time=1.057e-04, loss_ctc=84.266, loss_att=68.986, acc=0.690, loss=73.610, grad_norm=5.621, loss_scale=1.000, learning_rate=8.262e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 13:37:00,710 (deepspeed_trainer:228) INFO: 24epoch:train:6601-6700batch: iter_time=1.099e-04, loss_ctc=69.062, loss_att=55.992, acc=0.711, loss=59.915, grad_norm=5.123, loss_scale=1.000, learning_rate=8.261e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 13:37:37,876 (deepspeed_trainer:228) INFO: 24epoch:train:6701-6800batch: iter_time=1.151e-04, loss_ctc=69.395, loss_att=53.743, acc=0.715, loss=58.439, grad_norm=4.546, loss_scale=1.000, learning_rate=8.260e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 13:38:15,172 (deepspeed_trainer:228) INFO: 24epoch:train:6801-6900batch: iter_time=1.084e-04, loss_ctc=77.909, loss_att=58.591, acc=0.706, loss=64.404, grad_norm=5.135, loss_scale=1.000, learning_rate=8.259e-05, step_time=0.372 [2024-12-07 13:38:52,198] [INFO] [logging.py:129:log_dist] [Rank 0] step=352000, skipped=0, lr=[np.float64(8.257216509455311e-05)], mom=[[0.9, 0.98]] [2024-12-07 13:38:52,200] [INFO] [timer.py:264:stop] epoch=0/micro_step=97000/global_step=97000, RunningAvgSamplesPerSec=43.81957042743927, CurrSamplesPerSec=43.87883160020708, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 13:38:52,202 (deepspeed_trainer:228) INFO: 24epoch:train:6901-7000batch: iter_time=1.080e-04, loss_ctc=76.171, loss_att=52.162, acc=0.714, loss=59.343, grad_norm=4.846, loss_scale=1.000, learning_rate=8.258e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 13:39:29,292 (deepspeed_trainer:228) INFO: 24epoch:train:7001-7100batch: iter_time=1.122e-04, loss_ctc=73.985, loss_att=57.804, acc=0.708, loss=62.659, grad_norm=4.571, loss_scale=1.000, learning_rate=8.257e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 13:40:07,220 (deepspeed_trainer:228) INFO: 24epoch:train:7101-7200batch: iter_time=1.089e-04, loss_ctc=83.849, loss_att=67.594, acc=0.704, loss=72.445, grad_norm=5.174, loss_scale=1.000, learning_rate=8.255e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 13:40:44,598 (deepspeed_trainer:228) INFO: 24epoch:train:7201-7300batch: iter_time=1.086e-04, loss_ctc=87.235, loss_att=67.915, acc=0.698, loss=73.701, grad_norm=4.985, loss_scale=1.000, learning_rate=8.254e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 13:41:21,765 (deepspeed_trainer:228) INFO: 24epoch:train:7301-7400batch: iter_time=1.106e-04, loss_ctc=74.276, loss_att=53.152, acc=0.725, loss=59.483, grad_norm=4.943, loss_scale=1.000, learning_rate=8.253e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 13:41:58,681 (deepspeed_trainer:228) INFO: 24epoch:train:7401-7500batch: iter_time=1.063e-04, loss_ctc=73.119, loss_att=54.916, acc=0.703, loss=60.374, grad_norm=4.805, loss_scale=1.000, learning_rate=8.252e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 13:42:03,035 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-07 13:42:29,799 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 13:42:47,892 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 13:42:47,892 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 13:42:47,894 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 13:43:12,488] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:43:12,600] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:43:12,881] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:43:15,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:43:15,816] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:43:16,141] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:43:12,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:43:12,876] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:43:13,123] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:43:13,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:43:16,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:43:13,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:43:13,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:43:13,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:43:13,349] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:43:17,410] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:00,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:00,944] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:01,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:00,987] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:01,734] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:05,224] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:05,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:01,852] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:01,910] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:05,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:02,574] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:06,068] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:02,656] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:02,706] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:06,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:07,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:48,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:48,911] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:49,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:49,566] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:50,074] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:50,119] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:53,882] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:54,092] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:50,819] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:50,972] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:54,845] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:51,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:51,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:55,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:55,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:44:57,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:36,466] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:36,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:37,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:37,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:37,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:38,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:38,336] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:38,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:39,398] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:42,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:39,713] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:43,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:43,956] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:44,251] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:47,533] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:45:46,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 13:46:52,939 (deepspeed_trainer:228) INFO: 24epoch:train:7501-7600batch: iter_time=2.509, loss_ctc=78.251, loss_att=59.481, acc=0.704, loss=65.131, grad_norm=5.285, loss_scale=1.000, learning_rate=8.251e-05, step_time=0.434 [cnode7-012:0/16] 2024-12-07 13:47:30,222 (deepspeed_trainer:228) INFO: 24epoch:train:7601-7700batch: iter_time=1.084e-04, loss_ctc=69.033, loss_att=51.824, acc=0.703, loss=56.984, grad_norm=4.579, loss_scale=1.000, learning_rate=8.250e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 13:48:07,832 (deepspeed_trainer:228) INFO: 24epoch:train:7701-7800batch: iter_time=1.121e-04, loss_ctc=74.639, loss_att=55.769, acc=0.718, loss=61.439, grad_norm=4.599, loss_scale=1.000, learning_rate=8.248e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 13:48:45,816 (deepspeed_trainer:228) INFO: 24epoch:train:7801-7900batch: iter_time=1.114e-04, loss_ctc=68.241, loss_att=47.636, acc=0.719, loss=53.824, grad_norm=4.426, loss_scale=1.000, learning_rate=8.247e-05, step_time=0.380 [2024-12-07 13:49:23,502] [INFO] [logging.py:129:log_dist] [Rank 0] step=353000, skipped=0, lr=[np.float64(8.245512473662623e-05)], mom=[[0.9, 0.98]] [2024-12-07 13:49:23,502] [INFO] [timer.py:264:stop] epoch=0/micro_step=98000/global_step=98000, RunningAvgSamplesPerSec=43.81210227839693, CurrSamplesPerSec=43.74893608244232, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 13:49:23,504 (deepspeed_trainer:228) INFO: 24epoch:train:7901-8000batch: iter_time=1.113e-04, loss_ctc=77.452, loss_att=58.297, acc=0.712, loss=64.063, grad_norm=4.904, loss_scale=1.000, learning_rate=8.246e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 13:50:01,859 (deepspeed_trainer:228) INFO: 24epoch:train:8001-8100batch: iter_time=1.129e-04, loss_ctc=71.213, loss_att=53.517, acc=0.713, loss=58.839, grad_norm=5.029, loss_scale=1.000, learning_rate=8.245e-05, step_time=0.383 [cnode7-012:0/16] 2024-12-07 13:50:39,187 (deepspeed_trainer:228) INFO: 24epoch:train:8101-8200batch: iter_time=1.159e-04, loss_ctc=72.291, loss_att=56.002, acc=0.705, loss=60.890, grad_norm=4.897, loss_scale=1.000, learning_rate=8.244e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 13:51:16,599 (deepspeed_trainer:228) INFO: 24epoch:train:8201-8300batch: iter_time=1.155e-04, loss_ctc=70.618, loss_att=54.362, acc=0.699, loss=59.236, grad_norm=5.226, loss_scale=1.000, learning_rate=8.243e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 13:51:53,845 (deepspeed_trainer:228) INFO: 24epoch:train:8301-8400batch: iter_time=1.162e-04, loss_ctc=77.516, loss_att=57.065, acc=0.714, loss=63.209, grad_norm=4.930, loss_scale=1.000, learning_rate=8.241e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 13:52:31,287 (deepspeed_trainer:228) INFO: 24epoch:train:8401-8500batch: iter_time=1.200e-04, loss_ctc=85.029, loss_att=71.435, acc=0.681, loss=75.535, grad_norm=5.672, loss_scale=1.000, learning_rate=8.240e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 13:53:08,348 (deepspeed_trainer:228) INFO: 24epoch:train:8501-8600batch: iter_time=1.136e-04, loss_ctc=69.011, loss_att=55.198, acc=0.702, loss=59.355, grad_norm=4.617, loss_scale=1.000, learning_rate=8.239e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 13:53:45,824 (deepspeed_trainer:228) INFO: 24epoch:train:8601-8700batch: iter_time=1.078e-04, loss_ctc=68.552, loss_att=50.811, acc=0.713, loss=56.123, grad_norm=4.941, loss_scale=1.000, learning_rate=8.238e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 13:54:23,091 (deepspeed_trainer:228) INFO: 24epoch:train:8701-8800batch: iter_time=1.077e-04, loss_ctc=76.581, loss_att=60.983, acc=0.698, loss=65.680, grad_norm=5.405, loss_scale=1.000, learning_rate=8.237e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 13:55:00,129 (deepspeed_trainer:228) INFO: 24epoch:train:8801-8900batch: iter_time=1.107e-04, loss_ctc=77.680, loss_att=49.774, acc=0.714, loss=58.154, grad_norm=4.756, loss_scale=1.000, learning_rate=8.236e-05, step_time=0.370 [2024-12-07 13:55:37,140] [INFO] [logging.py:129:log_dist] [Rank 0] step=354000, skipped=0, lr=[np.float64(8.233858066191365e-05)], mom=[[0.9, 0.98]] [2024-12-07 13:55:37,141] [INFO] [timer.py:264:stop] epoch=0/micro_step=99000/global_step=99000, RunningAvgSamplesPerSec=43.813052703892026, CurrSamplesPerSec=44.96065307213944, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 13:55:37,143 (deepspeed_trainer:228) INFO: 24epoch:train:8901-9000batch: iter_time=1.091e-04, loss_ctc=82.929, loss_att=63.654, acc=0.704, loss=69.435, grad_norm=5.223, loss_scale=1.000, learning_rate=8.234e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 13:56:14,008 (deepspeed_trainer:228) INFO: 24epoch:train:9001-9100batch: iter_time=1.092e-04, loss_ctc=79.690, loss_att=62.624, acc=0.696, loss=67.746, grad_norm=5.638, loss_scale=1.000, learning_rate=8.233e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 13:56:51,372 (deepspeed_trainer:228) INFO: 24epoch:train:9101-9200batch: iter_time=1.151e-04, loss_ctc=81.697, loss_att=65.489, acc=0.698, loss=70.347, grad_norm=5.032, loss_scale=1.000, learning_rate=8.232e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 13:57:28,348 (deepspeed_trainer:228) INFO: 24epoch:train:9201-9300batch: iter_time=1.185e-04, loss_ctc=71.617, loss_att=46.940, acc=0.733, loss=54.319, grad_norm=4.820, loss_scale=1.000, learning_rate=8.231e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 13:58:00,621 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-07 13:58:26,788 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 13:58:42,895 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 13:58:42,895 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 13:58:42,897 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 13:59:07,946] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:08,538] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:05,006] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:05,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:09,167] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:05,531] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:06,100] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:07,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:07,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:07,200] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:11,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:07,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:12,588] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:14,069] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:14,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:15,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:56,608] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:57,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:53,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:54,038] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:57,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:54,794] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:54,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:55,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:56,024] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:56,481] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 13:59:56,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:01,066] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:02,288] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:03,922] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:04,064] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:04,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:44,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:45,065] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:41,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:42,090] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:46,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:43,512] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:44,092] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:44,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:45,053] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:45,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:45,714] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:49,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:51,453] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:52,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:52,737] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:00:53,816] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:32,956] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:29,634] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:34,033] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:30,399] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:34,762] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:31,452] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:32,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:33,324] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:33,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:33,887] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:39,508] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:37,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:41,568] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:42,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:42,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:01:42,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 14:02:19,233 (deepspeed_trainer:228) INFO: 24epoch:train:9301-9400batch: iter_time=2.500, loss_ctc=76.186, loss_att=58.642, acc=0.696, loss=63.918, grad_norm=5.077, loss_scale=1.000, learning_rate=8.230e-05, step_time=0.409 [cnode7-012:0/16] 2024-12-07 14:02:57,305 (deepspeed_trainer:228) INFO: 24epoch:train:9401-9500batch: iter_time=1.077e-04, loss_ctc=77.206, loss_att=57.718, acc=0.713, loss=63.577, grad_norm=5.008, loss_scale=1.000, learning_rate=8.229e-05, step_time=0.380 [cnode7-012:0/16] 2024-12-07 14:03:34,922 (deepspeed_trainer:228) INFO: 24epoch:train:9501-9600batch: iter_time=1.108e-04, loss_ctc=66.655, loss_att=48.892, acc=0.716, loss=54.247, grad_norm=4.830, loss_scale=1.000, learning_rate=8.227e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 14:04:12,579 (deepspeed_trainer:228) INFO: 24epoch:train:9601-9700batch: iter_time=1.080e-04, loss_ctc=80.099, loss_att=58.221, acc=0.707, loss=64.781, grad_norm=5.478, loss_scale=1.000, learning_rate=8.226e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 14:04:49,865 (deepspeed_trainer:228) INFO: 24epoch:train:9701-9800batch: iter_time=1.096e-04, loss_ctc=65.408, loss_att=48.678, acc=0.726, loss=53.705, grad_norm=4.234, loss_scale=1.000, learning_rate=8.225e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 14:05:27,793 (deepspeed_trainer:228) INFO: 24epoch:train:9801-9900batch: iter_time=1.085e-04, loss_ctc=78.890, loss_att=60.439, acc=0.713, loss=65.969, grad_norm=5.110, loss_scale=1.000, learning_rate=8.224e-05, step_time=0.379 [2024-12-07 14:06:06,102] [INFO] [logging.py:129:log_dist] [Rank 0] step=355000, skipped=0, lr=[np.float64(8.222252937300596e-05)], mom=[[0.9, 0.98]] [2024-12-07 14:06:06,102] [INFO] [timer.py:264:stop] epoch=0/micro_step=100000/global_step=100000, RunningAvgSamplesPerSec=43.807531081033616, CurrSamplesPerSec=43.658315334742035, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 14:06:06,104 (deepspeed_trainer:228) INFO: 24epoch:train:9901-10000batch: iter_time=1.083e-04, loss_ctc=68.801, loss_att=52.719, acc=0.711, loss=57.542, grad_norm=4.496, loss_scale=1.000, learning_rate=8.223e-05, step_time=0.383 [cnode7-012:0/16] 2024-12-07 14:06:44,310 (deepspeed_trainer:228) INFO: 24epoch:train:10001-10100batch: iter_time=1.064e-04, loss_ctc=69.225, loss_att=53.558, acc=0.714, loss=58.278, grad_norm=5.389, loss_scale=1.000, learning_rate=8.222e-05, step_time=0.382 [cnode7-012:0/16] 2024-12-07 14:07:22,651 (deepspeed_trainer:228) INFO: 24epoch:train:10101-10200batch: iter_time=1.070e-04, loss_ctc=75.768, loss_att=59.640, acc=0.718, loss=64.502, grad_norm=4.492, loss_scale=1.000, learning_rate=8.221e-05, step_time=0.383 [cnode7-012:0/16] 2024-12-07 14:08:00,297 (deepspeed_trainer:228) INFO: 24epoch:train:10201-10300batch: iter_time=1.071e-04, loss_ctc=77.501, loss_att=57.912, acc=0.706, loss=63.767, grad_norm=5.746, loss_scale=1.000, learning_rate=8.219e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 14:08:37,880 (deepspeed_trainer:228) INFO: 24epoch:train:10301-10400batch: iter_time=1.073e-04, loss_ctc=77.994, loss_att=66.595, acc=0.701, loss=69.996, grad_norm=5.168, loss_scale=1.000, learning_rate=8.218e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 14:09:15,078 (deepspeed_trainer:228) INFO: 24epoch:train:10401-10500batch: iter_time=1.066e-04, loss_ctc=68.153, loss_att=55.225, acc=0.712, loss=59.123, grad_norm=4.664, loss_scale=1.000, learning_rate=8.217e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 14:09:52,921 (deepspeed_trainer:228) INFO: 24epoch:train:10501-10600batch: iter_time=1.090e-04, loss_ctc=76.448, loss_att=57.452, acc=0.709, loss=63.132, grad_norm=5.661, loss_scale=1.000, learning_rate=8.216e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 14:10:30,132 (deepspeed_trainer:228) INFO: 24epoch:train:10601-10700batch: iter_time=1.105e-04, loss_ctc=81.743, loss_att=57.891, acc=0.711, loss=65.032, grad_norm=5.158, loss_scale=1.000, learning_rate=8.215e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 14:11:07,377 (deepspeed_trainer:228) INFO: 24epoch:train:10701-10800batch: iter_time=1.083e-04, loss_ctc=67.473, loss_att=48.738, acc=0.725, loss=54.400, grad_norm=4.375, loss_scale=1.000, learning_rate=8.214e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 14:11:45,369 (deepspeed_trainer:228) INFO: 24epoch:train:10801-10900batch: iter_time=1.084e-04, loss_ctc=84.914, loss_att=69.510, acc=0.693, loss=74.135, grad_norm=5.430, loss_scale=1.000, learning_rate=8.212e-05, step_time=0.380 [2024-12-07 14:12:22,964] [INFO] [logging.py:129:log_dist] [Rank 0] step=356000, skipped=0, lr=[np.float64(8.210696740690254e-05)], mom=[[0.9, 0.98]] [2024-12-07 14:12:22,965] [INFO] [timer.py:264:stop] epoch=0/micro_step=101000/global_step=101000, RunningAvgSamplesPerSec=43.804133844391195, CurrSamplesPerSec=43.18320234827133, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 14:12:22,966 (deepspeed_trainer:228) INFO: 24epoch:train:10901-11000batch: iter_time=1.062e-04, loss_ctc=82.748, loss_att=63.976, acc=0.705, loss=69.602, grad_norm=5.363, loss_scale=1.000, learning_rate=8.211e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 14:13:00,614 (deepspeed_trainer:228) INFO: 24epoch:train:11001-11100batch: iter_time=1.076e-04, loss_ctc=77.356, loss_att=59.705, acc=0.715, loss=64.976, grad_norm=4.960, loss_scale=1.000, learning_rate=8.210e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 14:13:37,742 (deepspeed_trainer:228) INFO: 24epoch:train:11101-11200batch: iter_time=1.072e-04, loss_ctc=67.588, loss_att=47.610, acc=0.722, loss=53.603, grad_norm=5.272, loss_scale=1.000, learning_rate=8.209e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 14:14:00,806 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-07 14:14:27,670 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 14:14:44,430 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 14:14:44,430 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 14:14:44,433 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 14:15:08,686] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:08,902] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:06,174] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:06,211] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:06,526] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:06,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:07,359] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:10,821] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:08,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:08,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:08,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:11,896] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:12,037] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:12,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:13,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:13,493] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:56,315] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:56,831] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:58,077] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:55,091] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:55,847] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:55,856] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:56,116] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:56,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:57,012] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:15:57,120] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:01,374] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:02,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:02,246] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:02,772] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:02,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:07,624] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:43,550] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:45,600] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:43,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:44,012] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:47,620] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:44,291] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:44,752] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:44,811] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:45,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:50,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:51,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:47,827] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:51,449] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:51,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:16:52,343] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:01,205] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:30,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:33,422] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:30,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:31,264] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:32,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:32,720] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:32,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:33,740] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:38,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:38,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:39,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:40,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:40,380] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:40,511] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:38,687] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:17:50,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 14:18:36,314 (deepspeed_trainer:228) INFO: 24epoch:train:11201-11300batch: iter_time=2.515, loss_ctc=81.540, loss_att=62.999, acc=0.696, loss=68.548, grad_norm=5.223, loss_scale=1.000, learning_rate=8.208e-05, step_time=0.471 [cnode7-012:0/16] 2024-12-07 14:19:13,321 (deepspeed_trainer:228) INFO: 24epoch:train:11301-11400batch: iter_time=1.096e-04, loss_ctc=76.205, loss_att=56.384, acc=0.709, loss=62.331, grad_norm=4.804, loss_scale=1.000, learning_rate=8.207e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 14:19:50,156 (deepspeed_trainer:228) INFO: 24epoch:train:11401-11500batch: iter_time=1.285e-04, loss_ctc=67.006, loss_att=51.062, acc=0.713, loss=55.842, grad_norm=4.653, loss_scale=1.000, learning_rate=8.206e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 14:20:27,361 (deepspeed_trainer:228) INFO: 24epoch:train:11501-11600batch: iter_time=1.103e-04, loss_ctc=75.993, loss_att=53.864, acc=0.712, loss=60.493, grad_norm=5.411, loss_scale=1.000, learning_rate=8.204e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 14:21:04,364 (deepspeed_trainer:228) INFO: 24epoch:train:11601-11700batch: iter_time=1.083e-04, loss_ctc=64.719, loss_att=48.267, acc=0.722, loss=53.187, grad_norm=4.133, loss_scale=1.000, learning_rate=8.203e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 14:21:41,434 (deepspeed_trainer:228) INFO: 24epoch:train:11701-11800batch: iter_time=1.088e-04, loss_ctc=82.130, loss_att=60.813, acc=0.710, loss=67.228, grad_norm=5.133, loss_scale=1.000, learning_rate=8.202e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 14:22:18,098 (deepspeed_trainer:228) INFO: 24epoch:train:11801-11900batch: iter_time=1.089e-04, loss_ctc=67.799, loss_att=51.983, acc=0.710, loss=56.724, grad_norm=4.831, loss_scale=1.000, learning_rate=8.201e-05, step_time=0.366 [2024-12-07 14:22:54,859] [INFO] [logging.py:129:log_dist] [Rank 0] step=357000, skipped=0, lr=[np.float64(8.19918913345775e-05)], mom=[[0.9, 0.98]] [2024-12-07 14:22:54,860] [INFO] [timer.py:264:stop] epoch=0/micro_step=102000/global_step=102000, RunningAvgSamplesPerSec=43.79662060756147, CurrSamplesPerSec=46.77533579100189, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 14:22:54,862 (deepspeed_trainer:228) INFO: 24epoch:train:11901-12000batch: iter_time=1.082e-04, loss_ctc=70.591, loss_att=56.863, acc=0.704, loss=60.975, grad_norm=4.567, loss_scale=1.000, learning_rate=8.200e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 14:23:31,867 (deepspeed_trainer:228) INFO: 24epoch:train:12001-12100batch: iter_time=1.143e-04, loss_ctc=73.884, loss_att=54.526, acc=0.717, loss=60.330, grad_norm=4.770, loss_scale=1.000, learning_rate=8.199e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 14:24:08,791 (deepspeed_trainer:228) INFO: 24epoch:train:12101-12200batch: iter_time=1.075e-04, loss_ctc=79.547, loss_att=60.138, acc=0.691, loss=65.960, grad_norm=5.613, loss_scale=1.000, learning_rate=8.197e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 14:24:45,776 (deepspeed_trainer:228) INFO: 24epoch:train:12201-12300batch: iter_time=1.090e-04, loss_ctc=73.634, loss_att=60.878, acc=0.704, loss=64.709, grad_norm=4.472, loss_scale=1.000, learning_rate=8.196e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 14:25:22,454 (deepspeed_trainer:228) INFO: 24epoch:train:12301-12400batch: iter_time=1.066e-04, loss_ctc=64.861, loss_att=49.361, acc=0.712, loss=54.035, grad_norm=4.750, loss_scale=1.000, learning_rate=8.195e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 14:25:59,130 (deepspeed_trainer:228) INFO: 24epoch:train:12401-12500batch: iter_time=1.131e-04, loss_ctc=77.720, loss_att=59.069, acc=0.693, loss=64.664, grad_norm=5.820, loss_scale=1.000, learning_rate=8.194e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 14:26:36,143 (deepspeed_trainer:228) INFO: 24epoch:train:12501-12600batch: iter_time=1.065e-04, loss_ctc=81.591, loss_att=58.010, acc=0.710, loss=65.071, grad_norm=5.154, loss_scale=1.000, learning_rate=8.193e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 14:27:12,860 (deepspeed_trainer:228) INFO: 24epoch:train:12601-12700batch: iter_time=1.084e-04, loss_ctc=69.224, loss_att=50.511, acc=0.714, loss=56.100, grad_norm=4.518, loss_scale=1.000, learning_rate=8.192e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 14:27:49,973 (deepspeed_trainer:228) INFO: 24epoch:train:12701-12800batch: iter_time=1.085e-04, loss_ctc=82.417, loss_att=65.301, acc=0.702, loss=70.470, grad_norm=5.041, loss_scale=1.000, learning_rate=8.191e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 14:28:26,982 (deepspeed_trainer:228) INFO: 24epoch:train:12801-12900batch: iter_time=1.101e-04, loss_ctc=88.086, loss_att=65.086, acc=0.699, loss=71.993, grad_norm=5.611, loss_scale=1.000, learning_rate=8.189e-05, step_time=0.370 [2024-12-07 14:29:04,030] [INFO] [logging.py:129:log_dist] [Rank 0] step=358000, skipped=0, lr=[np.float64(8.18772977605523e-05)], mom=[[0.9, 0.98]] [2024-12-07 14:29:04,030] [INFO] [timer.py:264:stop] epoch=0/micro_step=103000/global_step=103000, RunningAvgSamplesPerSec=43.80221019678368, CurrSamplesPerSec=45.51703181014418, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 14:29:04,032 (deepspeed_trainer:228) INFO: 24epoch:train:12901-13000batch: iter_time=1.095e-04, loss_ctc=74.493, loss_att=58.359, acc=0.720, loss=63.189, grad_norm=5.101, loss_scale=1.000, learning_rate=8.188e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 14:29:40,618 (deepspeed_trainer:228) INFO: 24epoch:train:13001-13100batch: iter_time=1.092e-04, loss_ctc=67.948, loss_att=48.475, acc=0.705, loss=54.315, grad_norm=4.967, loss_scale=1.000, learning_rate=8.187e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 14:29:54,236 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-07 14:30:21,123 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 14:30:37,622 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 14:30:37,622 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 14:30:37,624 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 14:31:03,046] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:03,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:00,392] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:00,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:04,181] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:01,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:01,388] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:05,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:01,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:05,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:02,152] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:02,204] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:02,976] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:06,759] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:06,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:08,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:51,092] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:51,212] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:49,384] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:52,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:49,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:49,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:50,362] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:50,424] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:50,924] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:51,468] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:55,350] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:55,403] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:56,143] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:56,927] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:53,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:31:58,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:38,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:38,654] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:37,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:41,145] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:38,057] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:38,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:38,801] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:38,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:39,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:39,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:45,339] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:45,568] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:45,961] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:46,413] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:43,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:32:47,381] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:26,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:26,815] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:25,043] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:29,170] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:26,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:26,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:27,395] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:27,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:27,716] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:27,827] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:33,336] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:33,658] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:34,014] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:35,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:35,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:33:33,807] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 14:34:31,220 (deepspeed_trainer:228) INFO: 24epoch:train:13101-13200batch: iter_time=2.505, loss_ctc=82.595, loss_att=62.271, acc=0.700, loss=68.362, grad_norm=5.116, loss_scale=1.000, learning_rate=8.186e-05, step_time=0.402 [cnode7-012:0/16] 2024-12-07 14:35:09,152 (deepspeed_trainer:228) INFO: 24epoch:train:13201-13300batch: iter_time=1.081e-04, loss_ctc=68.209, loss_att=50.097, acc=0.708, loss=55.548, grad_norm=4.782, loss_scale=1.000, learning_rate=8.185e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 14:35:47,061 (deepspeed_trainer:228) INFO: 24epoch:train:13301-13400batch: iter_time=1.019e-04, loss_ctc=76.839, loss_att=55.793, acc=0.718, loss=62.085, grad_norm=5.203, loss_scale=1.000, learning_rate=8.184e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 14:36:23,886 (deepspeed_trainer:228) INFO: 24epoch:train:13401-13500batch: iter_time=1.072e-04, loss_ctc=68.524, loss_att=47.735, acc=0.719, loss=53.980, grad_norm=5.328, loss_scale=1.000, learning_rate=8.183e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 14:37:00,976 (deepspeed_trainer:228) INFO: 24epoch:train:13501-13600batch: iter_time=1.069e-04, loss_ctc=68.776, loss_att=52.235, acc=0.718, loss=57.199, grad_norm=4.907, loss_scale=1.000, learning_rate=8.181e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 14:37:38,232 (deepspeed_trainer:228) INFO: 24epoch:train:13601-13700batch: iter_time=1.074e-04, loss_ctc=75.233, loss_att=55.004, acc=0.718, loss=61.086, grad_norm=4.356, loss_scale=1.000, learning_rate=8.180e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 14:38:15,394 (deepspeed_trainer:228) INFO: 24epoch:train:13701-13800batch: iter_time=1.066e-04, loss_ctc=71.426, loss_att=55.241, acc=0.707, loss=60.069, grad_norm=4.743, loss_scale=1.000, learning_rate=8.179e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 14:38:52,511 (deepspeed_trainer:228) INFO: 24epoch:train:13801-13900batch: iter_time=1.105e-04, loss_ctc=68.194, loss_att=53.829, acc=0.705, loss=58.149, grad_norm=4.546, loss_scale=1.000, learning_rate=8.178e-05, step_time=0.371 [2024-12-07 14:39:29,516] [INFO] [logging.py:129:log_dist] [Rank 0] step=359000, skipped=0, lr=[np.float64(8.176318332247504e-05)], mom=[[0.9, 0.98]] [2024-12-07 14:39:29,516] [INFO] [timer.py:264:stop] epoch=0/micro_step=104000/global_step=104000, RunningAvgSamplesPerSec=43.801093037584984, CurrSamplesPerSec=44.292035015711654, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 14:39:29,518 (deepspeed_trainer:228) INFO: 24epoch:train:13901-14000batch: iter_time=1.111e-04, loss_ctc=77.561, loss_att=57.663, acc=0.714, loss=63.648, grad_norm=4.754, loss_scale=1.000, learning_rate=8.177e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 14:40:06,320 (deepspeed_trainer:228) INFO: 24epoch:train:14001-14100batch: iter_time=1.064e-04, loss_ctc=82.190, loss_att=65.124, acc=0.691, loss=70.250, grad_norm=5.292, loss_scale=1.000, learning_rate=8.176e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 14:40:42,925 (deepspeed_trainer:228) INFO: 24epoch:train:14101-14200batch: iter_time=1.073e-04, loss_ctc=68.257, loss_att=54.526, acc=0.706, loss=58.646, grad_norm=4.550, loss_scale=1.000, learning_rate=8.175e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 14:41:19,880 (deepspeed_trainer:228) INFO: 24epoch:train:14201-14300batch: iter_time=1.095e-04, loss_ctc=68.866, loss_att=53.334, acc=0.709, loss=57.995, grad_norm=4.387, loss_scale=1.000, learning_rate=8.173e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 14:41:56,442 (deepspeed_trainer:228) INFO: 24epoch:train:14301-14400batch: iter_time=1.072e-04, loss_ctc=77.052, loss_att=58.555, acc=0.703, loss=64.119, grad_norm=5.592, loss_scale=1.000, learning_rate=8.172e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 14:42:33,137 (deepspeed_trainer:228) INFO: 24epoch:train:14401-14500batch: iter_time=1.070e-04, loss_ctc=75.224, loss_att=51.738, acc=0.711, loss=58.762, grad_norm=5.094, loss_scale=1.000, learning_rate=8.171e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 14:43:10,084 (deepspeed_trainer:228) INFO: 24epoch:train:14501-14600batch: iter_time=1.081e-04, loss_ctc=72.831, loss_att=54.474, acc=0.711, loss=59.978, grad_norm=4.480, loss_scale=1.000, learning_rate=8.170e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 14:43:47,212 (deepspeed_trainer:228) INFO: 24epoch:train:14601-14700batch: iter_time=1.126e-04, loss_ctc=83.282, loss_att=65.921, acc=0.705, loss=71.134, grad_norm=5.312, loss_scale=1.000, learning_rate=8.169e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 14:44:24,230 (deepspeed_trainer:228) INFO: 24epoch:train:14701-14800batch: iter_time=1.062e-04, loss_ctc=86.449, loss_att=66.937, acc=0.694, loss=72.775, grad_norm=5.015, loss_scale=1.000, learning_rate=8.168e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 14:45:01,066 (deepspeed_trainer:228) INFO: 24epoch:train:14801-14900batch: iter_time=1.075e-04, loss_ctc=73.528, loss_att=51.533, acc=0.726, loss=58.111, grad_norm=5.284, loss_scale=1.000, learning_rate=8.167e-05, step_time=0.368 [2024-12-07 14:45:37,838] [INFO] [logging.py:129:log_dist] [Rank 0] step=360000, skipped=0, lr=[np.float64(8.164954469070595e-05)], mom=[[0.9, 0.98]] [2024-12-07 14:45:37,838] [INFO] [timer.py:264:stop] epoch=0/micro_step=105000/global_step=105000, RunningAvgSamplesPerSec=43.8075212147137, CurrSamplesPerSec=46.874887729565366, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 14:45:37,840 (deepspeed_trainer:228) INFO: 24epoch:train:14901-15000batch: iter_time=1.054e-04, loss_ctc=72.051, loss_att=52.978, acc=0.706, loss=58.696, grad_norm=4.703, loss_scale=1.000, learning_rate=8.166e-05, step_time=0.368 [2024-12-07 14:45:53,264] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:45:53,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:45:53,287] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:45:53,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:45:54,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:45:54,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:45:53,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:45:54,600] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:45:54,606] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:45:53,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:45:54,661] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:45:54,678] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:45:54,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:45:54,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:45:54,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:45:54,218] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:08,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:08,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:09,230] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:08,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:09,349] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:10,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:10,505] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:10,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:10,283] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:11,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:10,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:10,524] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:10,901] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:11,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:10,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:11,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:24,451] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:24,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:25,334] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:24,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:24,883] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:25,822] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:26,204] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:26,562] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:25,888] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:25,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:26,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:26,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:26,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:26,947] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:27,812] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:28,112] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:39,166] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:39,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:39,898] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:40,917] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:40,270] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:40,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:40,961] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:41,734] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:41,985] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:41,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:42,326] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:41,686] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:41,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:43,141] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:42,566] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:43,558] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:46:56,049] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 24 is about to be saved! [2024-12-07 14:46:56,081] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/mp_rank_00_model_states.pt [2024-12-07 14:46:56,081] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/mp_rank_00_model_states.pt... [2024-12-07 14:46:58,018] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/mp_rank_00_model_states.pt. [2024-12-07 14:46:58,181] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-07 14:46:58,181] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-07 14:46:57,469] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-07 14:46:58,182] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-07 14:46:58,182] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-07 14:46:57,471] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-07 14:46:57,471] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-07 14:46:57,471] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-07 14:46:57,472] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-07 14:46:57,472] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-07 14:46:57,472] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-07 14:46:58,185] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-07 14:46:58,185] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-07 14:46:58,186] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-07 14:46:58,186] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-07 14:46:57,474] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-07 14:46:58,763] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,763] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-07 14:46:58,763] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [2024-12-07 14:46:58,817] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,817] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-07 14:46:58,817] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [2024-12-07 14:46:58,926] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,926] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-07 14:46:58,926] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [2024-12-07 14:46:58,235] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,236] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-07 14:46:58,236] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [2024-12-07 14:46:58,953] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,953] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-07 14:46:58,953] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [2024-12-07 14:46:58,957] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,957] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-07 14:46:58,957] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [2024-12-07 14:46:58,252] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,253] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-07 14:46:58,253] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [2024-12-07 14:46:58,254] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,254] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-07 14:46:58,255] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [2024-12-07 14:46:58,976] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,978] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,978] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-07 14:46:58,978] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [2024-12-07 14:46:58,979] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,980] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-07 14:46:58,980] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [2024-12-07 14:46:58,275] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,275] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-07 14:46:58,275] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [2024-12-07 14:46:58,276] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,276] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-07 14:46:58,276] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [2024-12-07 14:46:58,305] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,305] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-07 14:46:58,305] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [2024-12-07 14:46:58,322] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,322] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-07 14:46:58,322] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [2024-12-07 14:46:58,324] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-07 14:46:58,324] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-07 14:46:58,324] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [2024-12-07 14:46:59,109] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_24/24/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-07 14:46:59,109] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 24 is ready now! [cnode7-012:0/16] 2024-12-07 14:46:59,124 (deepspeed_trainer:158) INFO: 24epoch results: [train] iter_time=0.133, loss_ctc=75.359, loss_att=57.013, acc=0.708, loss=62.518, grad_norm=4.977, loss_scale=1.000, learning_rate=8.252e-05, step_time=0.375, time=2 hours, 7 minutes and 20.29 seconds, total_count=360024, gpu_max_cached_mem_GB=27.986, [valid] loss_ctc=3.828, cer_ctc=0.096, loss_att=6.688, acc=0.813, cer=0.319, wer=0.938, loss=5.844, time=1 minute and 8.77 seconds, total_count=24, gpu_max_cached_mem_GB=27.986 [cnode7-012:0/16] 2024-12-07 14:47:01,334 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-07 14:47:28,491 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 14:47:44,198 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 14:47:44,198 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 14:47:44,200 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 14:48:02,680] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:03,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:03,190] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:03,468] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:04,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:04,752] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:04,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:05,080] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:05,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:05,604] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:05,708] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:05,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:05,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:06,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:07,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:08,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:50,810] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:50,977] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:51,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:51,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:52,119] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:52,833] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:53,519] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:54,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:54,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:54,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:55,300] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:55,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:55,847] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:55,934] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:56,973] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:48:57,846] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:36,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:38,128] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:38,294] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:38,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:39,017] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:40,904] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:40,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:41,919] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:42,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:42,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:43,066] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:43,929] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:44,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:44,834] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:45,153] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:49:45,079] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:24,963] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:25,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:25,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:26,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:26,932] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:28,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:28,570] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:30,626] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:31,008] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:31,303] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:31,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:31,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:33,166] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:33,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:34,046] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 14:50:34,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 14:51:39,118 (deepspeed_trainer:228) INFO: 25epoch:train:1-100batch: iter_time=2.400, loss_ctc=74.411, loss_att=52.700, acc=0.712, loss=59.205, grad_norm=4.620, loss_scale=1.000, learning_rate=8.164e-05, step_time=0.386 [cnode7-012:0/16] 2024-12-07 14:52:16,775 (deepspeed_trainer:228) INFO: 25epoch:train:101-200batch: iter_time=1.093e-04, loss_ctc=83.143, loss_att=65.133, acc=0.699, loss=70.514, grad_norm=5.672, loss_scale=1.000, learning_rate=8.163e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 14:52:54,315 (deepspeed_trainer:228) INFO: 25epoch:train:201-300batch: iter_time=1.061e-04, loss_ctc=85.034, loss_att=71.534, acc=0.677, loss=75.568, grad_norm=5.833, loss_scale=1.000, learning_rate=8.162e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 14:53:31,344 (deepspeed_trainer:228) INFO: 25epoch:train:301-400batch: iter_time=1.085e-04, loss_ctc=76.706, loss_att=55.491, acc=0.713, loss=61.869, grad_norm=5.013, loss_scale=1.000, learning_rate=8.161e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 14:54:08,657 (deepspeed_trainer:228) INFO: 25epoch:train:401-500batch: iter_time=1.183e-04, loss_ctc=75.387, loss_att=54.619, acc=0.702, loss=60.851, grad_norm=5.499, loss_scale=1.000, learning_rate=8.160e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 14:54:45,912 (deepspeed_trainer:228) INFO: 25epoch:train:501-600batch: iter_time=1.121e-04, loss_ctc=83.477, loss_att=56.230, acc=0.702, loss=64.424, grad_norm=7.300, loss_scale=1.000, learning_rate=8.159e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 14:55:22,862 (deepspeed_trainer:228) INFO: 25epoch:train:601-700batch: iter_time=1.052e-04, loss_ctc=76.256, loss_att=53.905, acc=0.708, loss=60.595, grad_norm=5.733, loss_scale=1.000, learning_rate=8.158e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 14:56:00,028 (deepspeed_trainer:228) INFO: 25epoch:train:701-800batch: iter_time=1.106e-04, loss_ctc=66.407, loss_att=46.948, acc=0.711, loss=52.810, grad_norm=4.551, loss_scale=1.000, learning_rate=8.156e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 14:56:37,593 (deepspeed_trainer:228) INFO: 25epoch:train:801-900batch: iter_time=1.094e-04, loss_ctc=79.216, loss_att=53.624, acc=0.714, loss=61.288, grad_norm=5.405, loss_scale=1.000, learning_rate=8.155e-05, step_time=0.376 [2024-12-07 14:57:14,905] [INFO] [logging.py:129:log_dist] [Rank 0] step=361000, skipped=0, lr=[np.float64(8.153637856790947e-05)], mom=[[0.9, 0.98]] [2024-12-07 14:57:14,906] [INFO] [timer.py:264:stop] epoch=0/micro_step=106000/global_step=106000, RunningAvgSamplesPerSec=43.807631429859704, CurrSamplesPerSec=45.431460806139846, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 14:57:14,908 (deepspeed_trainer:228) INFO: 25epoch:train:901-1000batch: iter_time=1.091e-04, loss_ctc=75.531, loss_att=53.475, acc=0.712, loss=60.077, grad_norm=5.310, loss_scale=1.000, learning_rate=8.154e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 14:57:52,468 (deepspeed_trainer:228) INFO: 25epoch:train:1001-1100batch: iter_time=1.069e-04, loss_ctc=79.537, loss_att=62.968, acc=0.696, loss=67.935, grad_norm=4.762, loss_scale=1.000, learning_rate=8.153e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 14:58:30,227 (deepspeed_trainer:228) INFO: 25epoch:train:1101-1200batch: iter_time=1.082e-04, loss_ctc=72.164, loss_att=50.462, acc=0.714, loss=56.969, grad_norm=4.929, loss_scale=1.000, learning_rate=8.152e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 14:59:07,727 (deepspeed_trainer:228) INFO: 25epoch:train:1201-1300batch: iter_time=1.075e-04, loss_ctc=71.674, loss_att=54.759, acc=0.712, loss=59.833, grad_norm=4.854, loss_scale=1.000, learning_rate=8.151e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 14:59:45,014 (deepspeed_trainer:228) INFO: 25epoch:train:1301-1400batch: iter_time=1.070e-04, loss_ctc=79.692, loss_att=57.175, acc=0.710, loss=63.892, grad_norm=4.797, loss_scale=1.000, learning_rate=8.150e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 15:00:21,712 (deepspeed_trainer:228) INFO: 25epoch:train:1401-1500batch: iter_time=1.100e-04, loss_ctc=68.437, loss_att=49.669, acc=0.705, loss=55.303, grad_norm=4.510, loss_scale=1.000, learning_rate=8.149e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 15:00:58,788 (deepspeed_trainer:228) INFO: 25epoch:train:1501-1600batch: iter_time=1.088e-04, loss_ctc=74.613, loss_att=52.926, acc=0.712, loss=59.418, grad_norm=5.052, loss_scale=1.000, learning_rate=8.147e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 15:01:35,860 (deepspeed_trainer:228) INFO: 25epoch:train:1601-1700batch: iter_time=1.179e-04, loss_ctc=67.021, loss_att=50.572, acc=0.706, loss=55.496, grad_norm=4.515, loss_scale=1.000, learning_rate=8.146e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 15:02:13,462 (deepspeed_trainer:228) INFO: 25epoch:train:1701-1800batch: iter_time=1.096e-04, loss_ctc=87.839, loss_att=62.072, acc=0.700, loss=69.803, grad_norm=5.133, loss_scale=1.000, learning_rate=8.145e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 15:02:45,970 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-07 15:03:12,821 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 15:03:28,599 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 15:03:28,599 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 15:03:28,602 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 15:03:53,394] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:03:53,668] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:03:54,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:03:54,836] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:03:55,462] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:03:55,624] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:03:55,699] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:03:56,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:03:57,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:03:57,278] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:03:57,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:03:57,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:03:57,843] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:03:58,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:03:58,917] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:03:59,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:41,716] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:42,161] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:43,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:44,728] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:45,081] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:45,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:45,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:45,731] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:45,984] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:46,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:46,902] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:47,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:48,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:48,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:48,842] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:04:49,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:28,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:28,891] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:30,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:32,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:33,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:33,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:34,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:34,336] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:34,519] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:34,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:34,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:36,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:37,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:37,971] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:38,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:05:39,262] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:17,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:17,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:18,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:20,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:20,273] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:21,302] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:21,722] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:21,807] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:22,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:22,055] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:24,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:25,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:25,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:26,558] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:28,454] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:06:29,015] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 15:07:04,638 (deepspeed_trainer:228) INFO: 25epoch:train:1801-1900batch: iter_time=2.485, loss_ctc=73.869, loss_att=53.257, acc=0.712, loss=59.426, grad_norm=5.182, loss_scale=1.000, learning_rate=8.144e-05, step_time=0.421 [2024-12-07 15:07:42,659] [INFO] [logging.py:129:log_dist] [Rank 0] step=362000, skipped=0, lr=[np.float64(8.142368168865231e-05)], mom=[[0.9, 0.98]] [2024-12-07 15:07:42,660] [INFO] [timer.py:264:stop] epoch=0/micro_step=107000/global_step=107000, RunningAvgSamplesPerSec=43.80307384904385, CurrSamplesPerSec=42.29244979767708, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 15:07:42,661 (deepspeed_trainer:228) INFO: 25epoch:train:1901-2000batch: iter_time=1.106e-04, loss_ctc=90.211, loss_att=64.534, acc=0.701, loss=72.261, grad_norm=6.172, loss_scale=1.000, learning_rate=8.143e-05, step_time=0.380 [cnode7-012:0/16] 2024-12-07 15:08:20,091 (deepspeed_trainer:228) INFO: 25epoch:train:2001-2100batch: iter_time=1.073e-04, loss_ctc=74.807, loss_att=58.197, acc=0.706, loss=63.173, grad_norm=4.798, loss_scale=1.000, learning_rate=8.142e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 15:08:57,748 (deepspeed_trainer:228) INFO: 25epoch:train:2101-2200batch: iter_time=1.110e-04, loss_ctc=81.357, loss_att=67.998, acc=0.687, loss=72.028, grad_norm=5.880, loss_scale=1.000, learning_rate=8.141e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 15:09:35,236 (deepspeed_trainer:228) INFO: 25epoch:train:2201-2300batch: iter_time=1.140e-04, loss_ctc=77.057, loss_att=57.053, acc=0.708, loss=63.071, grad_norm=4.882, loss_scale=1.000, learning_rate=8.140e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 15:10:12,477 (deepspeed_trainer:228) INFO: 25epoch:train:2301-2400batch: iter_time=1.154e-04, loss_ctc=76.164, loss_att=55.589, acc=0.714, loss=61.746, grad_norm=4.837, loss_scale=1.000, learning_rate=8.138e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 15:10:49,583 (deepspeed_trainer:228) INFO: 25epoch:train:2401-2500batch: iter_time=1.123e-04, loss_ctc=74.547, loss_att=50.737, acc=0.706, loss=57.876, grad_norm=5.521, loss_scale=1.000, learning_rate=8.137e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 15:11:26,805 (deepspeed_trainer:228) INFO: 25epoch:train:2501-2600batch: iter_time=1.111e-04, loss_ctc=74.391, loss_att=50.566, acc=0.719, loss=57.681, grad_norm=5.989, loss_scale=1.000, learning_rate=8.136e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 15:12:03,894 (deepspeed_trainer:228) INFO: 25epoch:train:2601-2700batch: iter_time=1.120e-04, loss_ctc=69.892, loss_att=50.640, acc=0.707, loss=56.415, grad_norm=5.091, loss_scale=1.000, learning_rate=8.135e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 15:12:40,962 (deepspeed_trainer:228) INFO: 25epoch:train:2701-2800batch: iter_time=1.103e-04, loss_ctc=76.154, loss_att=51.110, acc=0.713, loss=58.640, grad_norm=5.430, loss_scale=1.000, learning_rate=8.134e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 15:13:18,207 (deepspeed_trainer:228) INFO: 25epoch:train:2801-2900batch: iter_time=1.123e-04, loss_ctc=75.594, loss_att=52.973, acc=0.713, loss=59.775, grad_norm=4.693, loss_scale=1.000, learning_rate=8.133e-05, step_time=0.372 [2024-12-07 15:13:55,674] [INFO] [logging.py:129:log_dist] [Rank 0] step=363000, skipped=0, lr=[np.float64(8.131145081900776e-05)], mom=[[0.9, 0.98]] [2024-12-07 15:13:55,674] [INFO] [timer.py:264:stop] epoch=0/micro_step=108000/global_step=108000, RunningAvgSamplesPerSec=43.80484728722333, CurrSamplesPerSec=46.68471310705494, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 15:13:55,676 (deepspeed_trainer:228) INFO: 25epoch:train:2901-3000batch: iter_time=1.117e-04, loss_ctc=78.630, loss_att=61.601, acc=0.704, loss=66.733, grad_norm=5.137, loss_scale=1.000, learning_rate=8.132e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 15:14:33,017 (deepspeed_trainer:228) INFO: 25epoch:train:3001-3100batch: iter_time=1.119e-04, loss_ctc=72.846, loss_att=54.766, acc=0.707, loss=60.219, grad_norm=4.821, loss_scale=1.000, learning_rate=8.131e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 15:15:10,331 (deepspeed_trainer:228) INFO: 25epoch:train:3101-3200batch: iter_time=1.135e-04, loss_ctc=67.262, loss_att=51.665, acc=0.720, loss=56.362, grad_norm=4.763, loss_scale=1.000, learning_rate=8.129e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 15:15:47,823 (deepspeed_trainer:228) INFO: 25epoch:train:3201-3300batch: iter_time=1.136e-04, loss_ctc=74.694, loss_att=52.817, acc=0.714, loss=59.367, grad_norm=4.836, loss_scale=1.000, learning_rate=8.128e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 15:16:25,401 (deepspeed_trainer:228) INFO: 25epoch:train:3301-3400batch: iter_time=1.138e-04, loss_ctc=73.271, loss_att=51.396, acc=0.710, loss=57.958, grad_norm=4.769, loss_scale=1.000, learning_rate=8.127e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 15:17:02,915 (deepspeed_trainer:228) INFO: 25epoch:train:3401-3500batch: iter_time=1.112e-04, loss_ctc=75.343, loss_att=55.551, acc=0.709, loss=61.472, grad_norm=5.675, loss_scale=1.000, learning_rate=8.126e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 15:17:40,121 (deepspeed_trainer:228) INFO: 25epoch:train:3501-3600batch: iter_time=1.103e-04, loss_ctc=66.751, loss_att=48.313, acc=0.717, loss=53.874, grad_norm=4.600, loss_scale=1.000, learning_rate=8.125e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 15:18:17,645 (deepspeed_trainer:228) INFO: 25epoch:train:3601-3700batch: iter_time=1.156e-04, loss_ctc=87.316, loss_att=64.362, acc=0.694, loss=71.252, grad_norm=5.310, loss_scale=1.000, learning_rate=8.124e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 15:18:41,099 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-07 15:19:08,309 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 15:19:25,859 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 15:19:25,859 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 15:19:25,861 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 15:19:48,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:19:49,058] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:19:48,624] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:19:48,738] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:19:49,449] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:19:48,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:19:49,587] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:19:49,686] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:19:49,738] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:19:49,773] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:19:50,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:19:52,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:19:52,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:19:53,976] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:19:54,402] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:19:55,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:36,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:36,911] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:37,212] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:36,539] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:37,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:37,728] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:37,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:38,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:38,667] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:38,678] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:39,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:40,963] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:42,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:43,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:44,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:20:44,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:23,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:24,563] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:24,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:24,400] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:25,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:25,426] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:25,805] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:26,145] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:26,539] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:27,044] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:29,982] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:29,856] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:31,847] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:32,222] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:33,203] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:21:33,693] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:10,060] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:12,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:12,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:11,626] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:12,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:13,158] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:13,413] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:13,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:14,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:14,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:18,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:19,211] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:20,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:20,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:21,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:22:23,021] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 15:23:08,961 (deepspeed_trainer:228) INFO: 25epoch:train:3701-3800batch: iter_time=2.518, loss_ctc=77.001, loss_att=53.108, acc=0.716, loss=60.267, grad_norm=4.790, loss_scale=1.000, learning_rate=8.123e-05, step_time=0.395 [cnode7-012:0/16] 2024-12-07 15:23:46,599 (deepspeed_trainer:228) INFO: 25epoch:train:3801-3900batch: iter_time=1.134e-04, loss_ctc=87.666, loss_att=64.541, acc=0.706, loss=71.483, grad_norm=5.928, loss_scale=1.000, learning_rate=8.122e-05, step_time=0.376 [2024-12-07 15:24:23,888] [INFO] [logging.py:129:log_dist] [Rank 0] step=364000, skipped=0, lr=[np.float64(8.119968275616586e-05)], mom=[[0.9, 0.98]] [2024-12-07 15:24:23,889] [INFO] [timer.py:264:stop] epoch=0/micro_step=109000/global_step=109000, RunningAvgSamplesPerSec=43.80295246794389, CurrSamplesPerSec=43.395481495714286, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 15:24:23,891 (deepspeed_trainer:228) INFO: 25epoch:train:3901-4000batch: iter_time=1.102e-04, loss_ctc=74.266, loss_att=57.978, acc=0.700, loss=62.839, grad_norm=5.716, loss_scale=1.000, learning_rate=8.121e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 15:25:01,371 (deepspeed_trainer:228) INFO: 25epoch:train:4001-4100batch: iter_time=1.079e-04, loss_ctc=76.594, loss_att=61.951, acc=0.699, loss=66.323, grad_norm=5.426, loss_scale=1.000, learning_rate=8.119e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 15:25:38,668 (deepspeed_trainer:228) INFO: 25epoch:train:4101-4200batch: iter_time=1.115e-04, loss_ctc=77.179, loss_att=56.784, acc=0.703, loss=62.901, grad_norm=5.450, loss_scale=1.000, learning_rate=8.118e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 15:26:16,121 (deepspeed_trainer:228) INFO: 25epoch:train:4201-4300batch: iter_time=1.101e-04, loss_ctc=82.891, loss_att=56.698, acc=0.716, loss=64.543, grad_norm=5.333, loss_scale=1.000, learning_rate=8.117e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 15:26:53,442 (deepspeed_trainer:228) INFO: 25epoch:train:4301-4400batch: iter_time=1.102e-04, loss_ctc=70.955, loss_att=52.844, acc=0.703, loss=58.251, grad_norm=5.706, loss_scale=1.000, learning_rate=8.116e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 15:27:30,622 (deepspeed_trainer:228) INFO: 25epoch:train:4401-4500batch: iter_time=1.117e-04, loss_ctc=69.726, loss_att=48.936, acc=0.720, loss=55.143, grad_norm=4.515, loss_scale=1.000, learning_rate=8.115e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 15:28:07,924 (deepspeed_trainer:228) INFO: 25epoch:train:4501-4600batch: iter_time=1.091e-04, loss_ctc=67.515, loss_att=46.283, acc=0.714, loss=52.661, grad_norm=5.126, loss_scale=1.000, learning_rate=8.114e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 15:28:45,332 (deepspeed_trainer:228) INFO: 25epoch:train:4601-4700batch: iter_time=1.110e-04, loss_ctc=80.656, loss_att=56.281, acc=0.709, loss=63.593, grad_norm=5.365, loss_scale=1.000, learning_rate=8.113e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 15:29:22,826 (deepspeed_trainer:228) INFO: 25epoch:train:4701-4800batch: iter_time=1.103e-04, loss_ctc=72.375, loss_att=51.848, acc=0.716, loss=58.008, grad_norm=4.814, loss_scale=1.000, learning_rate=8.112e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 15:30:00,395 (deepspeed_trainer:228) INFO: 25epoch:train:4801-4900batch: iter_time=1.108e-04, loss_ctc=76.329, loss_att=57.631, acc=0.709, loss=63.248, grad_norm=4.929, loss_scale=1.000, learning_rate=8.110e-05, step_time=0.375 [2024-12-07 15:30:37,592] [INFO] [logging.py:129:log_dist] [Rank 0] step=365000, skipped=0, lr=[np.float64(8.108837432804956e-05)], mom=[[0.9, 0.98]] [2024-12-07 15:30:37,592] [INFO] [timer.py:264:stop] epoch=0/micro_step=110000/global_step=110000, RunningAvgSamplesPerSec=43.804335292727025, CurrSamplesPerSec=42.70643910424282, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 15:30:37,594 (deepspeed_trainer:228) INFO: 25epoch:train:4901-5000batch: iter_time=1.122e-04, loss_ctc=73.471, loss_att=58.081, acc=0.707, loss=62.713, grad_norm=4.801, loss_scale=1.000, learning_rate=8.109e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 15:31:14,902 (deepspeed_trainer:228) INFO: 25epoch:train:5001-5100batch: iter_time=1.084e-04, loss_ctc=72.220, loss_att=52.786, acc=0.717, loss=58.611, grad_norm=4.919, loss_scale=1.000, learning_rate=8.108e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 15:31:51,866 (deepspeed_trainer:228) INFO: 25epoch:train:5101-5200batch: iter_time=1.207e-04, loss_ctc=73.140, loss_att=51.455, acc=0.717, loss=57.944, grad_norm=4.596, loss_scale=1.000, learning_rate=8.107e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 15:32:28,818 (deepspeed_trainer:228) INFO: 25epoch:train:5201-5300batch: iter_time=1.096e-04, loss_ctc=71.994, loss_att=52.385, acc=0.713, loss=58.252, grad_norm=4.694, loss_scale=1.000, learning_rate=8.106e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 15:33:05,678 (deepspeed_trainer:228) INFO: 25epoch:train:5301-5400batch: iter_time=1.100e-04, loss_ctc=68.061, loss_att=48.115, acc=0.720, loss=54.097, grad_norm=4.506, loss_scale=1.000, learning_rate=8.105e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 15:33:43,062 (deepspeed_trainer:228) INFO: 25epoch:train:5401-5500batch: iter_time=1.099e-04, loss_ctc=73.184, loss_att=50.138, acc=0.709, loss=57.067, grad_norm=5.110, loss_scale=1.000, learning_rate=8.104e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 15:34:20,453 (deepspeed_trainer:228) INFO: 25epoch:train:5501-5600batch: iter_time=1.089e-04, loss_ctc=82.714, loss_att=63.747, acc=0.697, loss=69.436, grad_norm=5.064, loss_scale=1.000, learning_rate=8.103e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 15:34:34,504 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-07 15:35:00,623 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 15:35:17,811 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 15:35:17,811 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 15:35:17,813 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 15:35:42,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:35:43,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:35:43,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:35:43,793] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:35:43,729] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:35:43,842] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:35:43,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:35:44,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:35:44,874] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:35:44,909] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:35:45,263] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:35:45,684] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:35:46,391] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:35:46,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:35:47,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:35:49,034] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:31,604] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:31,641] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:32,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:32,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:32,829] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:32,895] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:33,469] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:33,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:33,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:35,033] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:35,496] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:36,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:36,334] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:36,401] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:36,537] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:36:38,474] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:19,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:19,797] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:19,717] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:19,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:20,927] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:22,192] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:22,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:22,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:22,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:24,861] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:24,987] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:25,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:25,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:26,152] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:26,678] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:37:27,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:07,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:07,340] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:07,588] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:08,302] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:09,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:09,601] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:11,494] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:11,562] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:11,807] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:14,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:14,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:15,418] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:15,911] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:16,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:16,237] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:38:16,463] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 15:39:11,496 (deepspeed_trainer:228) INFO: 25epoch:train:5601-5700batch: iter_time=2.519, loss_ctc=79.846, loss_att=58.242, acc=0.711, loss=64.718, grad_norm=5.127, loss_scale=1.000, learning_rate=8.102e-05, step_time=0.391 [cnode7-012:0/16] 2024-12-07 15:39:48,913 (deepspeed_trainer:228) INFO: 25epoch:train:5701-5800batch: iter_time=1.077e-04, loss_ctc=84.206, loss_att=67.953, acc=0.706, loss=72.873, grad_norm=5.678, loss_scale=1.000, learning_rate=8.101e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 15:40:27,574 (deepspeed_trainer:228) INFO: 25epoch:train:5801-5900batch: iter_time=1.116e-04, loss_ctc=76.336, loss_att=63.662, acc=0.701, loss=67.442, grad_norm=5.140, loss_scale=1.000, learning_rate=8.099e-05, step_time=0.387 [2024-12-07 15:41:05,279] [INFO] [logging.py:129:log_dist] [Rank 0] step=366000, skipped=0, lr=[np.float64(8.097752239293657e-05)], mom=[[0.9, 0.98]] [2024-12-07 15:41:05,279] [INFO] [timer.py:264:stop] epoch=0/micro_step=111000/global_step=111000, RunningAvgSamplesPerSec=43.80317003740323, CurrSamplesPerSec=44.086492350142585, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 15:41:05,281 (deepspeed_trainer:228) INFO: 25epoch:train:5901-6000batch: iter_time=1.102e-04, loss_ctc=80.549, loss_att=65.418, acc=0.710, loss=69.948, grad_norm=4.984, loss_scale=1.000, learning_rate=8.098e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 15:41:42,684 (deepspeed_trainer:228) INFO: 25epoch:train:6001-6100batch: iter_time=1.084e-04, loss_ctc=73.203, loss_att=52.566, acc=0.711, loss=58.756, grad_norm=5.088, loss_scale=1.000, learning_rate=8.097e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 15:42:20,370 (deepspeed_trainer:228) INFO: 25epoch:train:6101-6200batch: iter_time=1.090e-04, loss_ctc=78.882, loss_att=53.978, acc=0.722, loss=61.434, grad_norm=4.573, loss_scale=1.000, learning_rate=8.096e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 15:42:57,491 (deepspeed_trainer:228) INFO: 25epoch:train:6201-6300batch: iter_time=1.103e-04, loss_ctc=73.314, loss_att=58.375, acc=0.700, loss=62.876, grad_norm=6.438, loss_scale=1.000, learning_rate=8.095e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 15:43:34,307 (deepspeed_trainer:228) INFO: 25epoch:train:6301-6400batch: iter_time=1.157e-04, loss_ctc=65.150, loss_att=46.085, acc=0.723, loss=51.788, grad_norm=4.446, loss_scale=1.000, learning_rate=8.094e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 15:44:11,608 (deepspeed_trainer:228) INFO: 25epoch:train:6401-6500batch: iter_time=1.191e-04, loss_ctc=74.064, loss_att=51.799, acc=0.710, loss=58.491, grad_norm=5.308, loss_scale=1.000, learning_rate=8.093e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 15:44:48,782 (deepspeed_trainer:228) INFO: 25epoch:train:6501-6600batch: iter_time=1.094e-04, loss_ctc=76.159, loss_att=54.787, acc=0.721, loss=61.205, grad_norm=5.313, loss_scale=1.000, learning_rate=8.092e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 15:45:26,222 (deepspeed_trainer:228) INFO: 25epoch:train:6601-6700batch: iter_time=1.121e-04, loss_ctc=75.696, loss_att=57.379, acc=0.710, loss=62.893, grad_norm=5.356, loss_scale=1.000, learning_rate=8.091e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 15:46:03,634 (deepspeed_trainer:228) INFO: 25epoch:train:6701-6800batch: iter_time=1.148e-04, loss_ctc=77.925, loss_att=57.651, acc=0.714, loss=63.683, grad_norm=5.257, loss_scale=1.000, learning_rate=8.089e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 15:46:40,862 (deepspeed_trainer:228) INFO: 25epoch:train:6801-6900batch: iter_time=1.161e-04, loss_ctc=66.218, loss_att=52.576, acc=0.716, loss=56.673, grad_norm=4.443, loss_scale=1.000, learning_rate=8.088e-05, step_time=0.372 [2024-12-07 15:47:18,240] [INFO] [logging.py:129:log_dist] [Rank 0] step=367000, skipped=0, lr=[np.float64(8.086712383908692e-05)], mom=[[0.9, 0.98]] [2024-12-07 15:47:18,241] [INFO] [timer.py:264:stop] epoch=0/micro_step=112000/global_step=112000, RunningAvgSamplesPerSec=43.80444595008179, CurrSamplesPerSec=44.80740385956649, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 15:47:18,242 (deepspeed_trainer:228) INFO: 25epoch:train:6901-7000batch: iter_time=1.110e-04, loss_ctc=75.262, loss_att=58.258, acc=0.721, loss=63.361, grad_norm=4.714, loss_scale=1.000, learning_rate=8.087e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 15:47:55,784 (deepspeed_trainer:228) INFO: 25epoch:train:7001-7100batch: iter_time=1.125e-04, loss_ctc=71.181, loss_att=51.888, acc=0.713, loss=57.684, grad_norm=4.637, loss_scale=1.000, learning_rate=8.086e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 15:48:32,853 (deepspeed_trainer:228) INFO: 25epoch:train:7101-7200batch: iter_time=1.119e-04, loss_ctc=75.231, loss_att=52.495, acc=0.722, loss=59.333, grad_norm=6.132, loss_scale=1.000, learning_rate=8.085e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 15:49:09,696 (deepspeed_trainer:228) INFO: 25epoch:train:7201-7300batch: iter_time=1.187e-04, loss_ctc=65.774, loss_att=48.648, acc=0.717, loss=53.792, grad_norm=4.821, loss_scale=1.000, learning_rate=8.084e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 15:49:46,594 (deepspeed_trainer:228) INFO: 25epoch:train:7301-7400batch: iter_time=1.146e-04, loss_ctc=81.343, loss_att=57.157, acc=0.710, loss=64.422, grad_norm=4.926, loss_scale=1.000, learning_rate=8.083e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 15:50:23,758 (deepspeed_trainer:228) INFO: 25epoch:train:7401-7500batch: iter_time=1.142e-04, loss_ctc=79.438, loss_att=61.625, acc=0.711, loss=66.969, grad_norm=4.895, loss_scale=1.000, learning_rate=8.082e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 15:50:28,339 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-07 15:50:55,009 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 15:51:11,465 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 15:51:11,465 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 15:51:11,467 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 15:51:35,304] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:51:35,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:51:36,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:51:36,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:51:36,990] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:51:37,130] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:51:37,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:51:37,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:51:38,021] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:51:38,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:51:38,754] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:51:39,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:51:40,064] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:51:40,343] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:51:41,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:51:41,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:23,228] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:23,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:23,962] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:25,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:25,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:25,401] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:25,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:25,967] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:26,469] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:26,752] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:26,982] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:28,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:29,626] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:30,166] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:30,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:52:30,916] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:11,303] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:11,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:11,665] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:12,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:13,237] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:13,773] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:14,363] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:14,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:14,801] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:15,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:15,411] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:18,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:18,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:18,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:19,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:20,391] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:59,294] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:53:59,528] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:54:01,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:54:01,462] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:54:02,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:54:03,220] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:54:03,635] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:54:03,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:54:03,586] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:54:03,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:54:04,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:54:06,508] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:54:07,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:54:07,338] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:54:08,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 15:54:09,163] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 15:55:14,011 (deepspeed_trainer:228) INFO: 25epoch:train:7501-7600batch: iter_time=2.498, loss_ctc=73.201, loss_att=52.462, acc=0.720, loss=58.708, grad_norm=4.746, loss_scale=1.000, learning_rate=8.081e-05, step_time=0.404 [cnode7-012:0/16] 2024-12-07 15:55:51,310 (deepspeed_trainer:228) INFO: 25epoch:train:7601-7700batch: iter_time=1.181e-04, loss_ctc=81.966, loss_att=66.189, acc=0.710, loss=70.947, grad_norm=5.611, loss_scale=1.000, learning_rate=8.080e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 15:56:29,115 (deepspeed_trainer:228) INFO: 25epoch:train:7701-7800batch: iter_time=1.177e-04, loss_ctc=84.852, loss_att=70.154, acc=0.693, loss=74.558, grad_norm=6.263, loss_scale=1.000, learning_rate=8.078e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 15:57:06,506 (deepspeed_trainer:228) INFO: 25epoch:train:7801-7900batch: iter_time=1.203e-04, loss_ctc=74.596, loss_att=55.277, acc=0.723, loss=61.073, grad_norm=4.535, loss_scale=1.000, learning_rate=8.077e-05, step_time=0.374 [2024-12-07 15:57:43,792] [INFO] [logging.py:129:log_dist] [Rank 0] step=368000, skipped=0, lr=[np.float64(8.075717558437603e-05)], mom=[[0.9, 0.98]] [2024-12-07 15:57:43,793] [INFO] [timer.py:264:stop] epoch=0/micro_step=113000/global_step=113000, RunningAvgSamplesPerSec=43.80292247722455, CurrSamplesPerSec=46.32366025914877, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 15:57:43,794 (deepspeed_trainer:228) INFO: 25epoch:train:7901-8000batch: iter_time=1.201e-04, loss_ctc=71.479, loss_att=53.145, acc=0.714, loss=58.658, grad_norm=5.203, loss_scale=1.000, learning_rate=8.076e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 15:58:21,198 (deepspeed_trainer:228) INFO: 25epoch:train:8001-8100batch: iter_time=1.187e-04, loss_ctc=79.764, loss_att=55.562, acc=0.710, loss=62.850, grad_norm=6.066, loss_scale=1.000, learning_rate=8.075e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 15:58:58,511 (deepspeed_trainer:228) INFO: 25epoch:train:8101-8200batch: iter_time=1.227e-04, loss_ctc=70.833, loss_att=54.871, acc=0.712, loss=59.697, grad_norm=5.246, loss_scale=1.000, learning_rate=8.074e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 15:59:35,819 (deepspeed_trainer:228) INFO: 25epoch:train:8201-8300batch: iter_time=1.195e-04, loss_ctc=65.172, loss_att=46.376, acc=0.720, loss=52.024, grad_norm=5.086, loss_scale=1.000, learning_rate=8.073e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 16:00:13,246 (deepspeed_trainer:228) INFO: 25epoch:train:8301-8400batch: iter_time=1.206e-04, loss_ctc=76.454, loss_att=51.785, acc=0.723, loss=59.191, grad_norm=5.308, loss_scale=1.000, learning_rate=8.072e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 16:00:50,749 (deepspeed_trainer:228) INFO: 25epoch:train:8401-8500batch: iter_time=1.198e-04, loss_ctc=73.848, loss_att=53.609, acc=0.718, loss=59.686, grad_norm=5.184, loss_scale=1.000, learning_rate=8.071e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 16:01:28,131 (deepspeed_trainer:228) INFO: 25epoch:train:8501-8600batch: iter_time=1.196e-04, loss_ctc=78.488, loss_att=63.036, acc=0.702, loss=67.707, grad_norm=5.002, loss_scale=1.000, learning_rate=8.070e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 16:02:05,003 (deepspeed_trainer:228) INFO: 25epoch:train:8601-8700batch: iter_time=1.180e-04, loss_ctc=70.923, loss_att=49.680, acc=0.722, loss=56.057, grad_norm=4.530, loss_scale=1.000, learning_rate=8.069e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 16:02:42,192 (deepspeed_trainer:228) INFO: 25epoch:train:8701-8800batch: iter_time=1.208e-04, loss_ctc=70.885, loss_att=54.881, acc=0.725, loss=59.680, grad_norm=4.761, loss_scale=1.000, learning_rate=8.067e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 16:03:19,414 (deepspeed_trainer:228) INFO: 25epoch:train:8801-8900batch: iter_time=1.189e-04, loss_ctc=77.095, loss_att=56.917, acc=0.723, loss=62.988, grad_norm=4.938, loss_scale=1.000, learning_rate=8.066e-05, step_time=0.372 [2024-12-07 16:03:56,245] [INFO] [logging.py:129:log_dist] [Rank 0] step=369000, skipped=0, lr=[np.float64(8.064767457593339e-05)], mom=[[0.9, 0.98]] [2024-12-07 16:03:56,246] [INFO] [timer.py:264:stop] epoch=0/micro_step=114000/global_step=114000, RunningAvgSamplesPerSec=43.805164078273265, CurrSamplesPerSec=45.57533140950505, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 16:03:56,247 (deepspeed_trainer:228) INFO: 25epoch:train:8901-9000batch: iter_time=1.202e-04, loss_ctc=67.123, loss_att=49.735, acc=0.712, loss=54.970, grad_norm=4.779, loss_scale=1.000, learning_rate=8.065e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 16:04:33,139 (deepspeed_trainer:228) INFO: 25epoch:train:9001-9100batch: iter_time=1.110e-04, loss_ctc=73.576, loss_att=53.905, acc=0.719, loss=59.832, grad_norm=5.351, loss_scale=1.000, learning_rate=8.064e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 16:05:09,962 (deepspeed_trainer:228) INFO: 25epoch:train:9101-9200batch: iter_time=1.117e-04, loss_ctc=65.351, loss_att=49.556, acc=0.719, loss=54.276, grad_norm=5.017, loss_scale=1.000, learning_rate=8.063e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 16:05:47,216 (deepspeed_trainer:228) INFO: 25epoch:train:9201-9300batch: iter_time=1.151e-04, loss_ctc=86.096, loss_att=61.037, acc=0.715, loss=68.560, grad_norm=5.037, loss_scale=1.000, learning_rate=8.062e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 16:06:19,842 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-07 16:06:46,459 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 16:07:02,712 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 16:07:02,712 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 16:07:02,715 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 16:07:27,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:07:28,066] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:07:28,429] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:07:28,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:07:28,693] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:07:29,132] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:07:29,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:07:29,519] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:07:30,109] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:07:30,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:07:30,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:07:31,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:07:31,562] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:07:31,952] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:07:32,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:07:33,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:15,036] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:16,258] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:17,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:17,427] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:17,874] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:17,977] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:18,078] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:18,432] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:18,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:18,856] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:19,626] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:20,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:21,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:21,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:22,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:08:23,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:03,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:04,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:05,526] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:05,707] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:05,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:06,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:07,015] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:06,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:07,098] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:07,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:07,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:09,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:11,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:11,745] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:12,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:13,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:51,449] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:52,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:53,155] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:53,494] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:53,836] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:54,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:55,303] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:55,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:55,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:58,983] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:59,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:09:59,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:10:01,174] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:10:01,584] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:10:01,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:10:02,847] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 16:10:38,495 (deepspeed_trainer:228) INFO: 25epoch:train:9301-9400batch: iter_time=2.532, loss_ctc=72.631, loss_att=52.493, acc=0.721, loss=58.539, grad_norm=4.892, loss_scale=1.000, learning_rate=8.061e-05, step_time=0.381 [cnode7-012:0/16] 2024-12-07 16:11:15,909 (deepspeed_trainer:228) INFO: 25epoch:train:9401-9500batch: iter_time=1.110e-04, loss_ctc=86.239, loss_att=64.858, acc=0.708, loss=71.293, grad_norm=6.277, loss_scale=1.000, learning_rate=8.060e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 16:11:53,212 (deepspeed_trainer:228) INFO: 25epoch:train:9501-9600batch: iter_time=1.078e-04, loss_ctc=73.912, loss_att=59.789, acc=0.714, loss=64.020, grad_norm=5.031, loss_scale=1.000, learning_rate=8.059e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 16:12:30,786 (deepspeed_trainer:228) INFO: 25epoch:train:9601-9700batch: iter_time=1.115e-04, loss_ctc=80.404, loss_att=68.361, acc=0.696, loss=71.931, grad_norm=5.982, loss_scale=1.000, learning_rate=8.058e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 16:13:08,076 (deepspeed_trainer:228) INFO: 25epoch:train:9701-9800batch: iter_time=1.098e-04, loss_ctc=74.818, loss_att=56.748, acc=0.717, loss=62.192, grad_norm=4.898, loss_scale=1.000, learning_rate=8.057e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 16:13:45,639 (deepspeed_trainer:228) INFO: 25epoch:train:9801-9900batch: iter_time=1.090e-04, loss_ctc=75.562, loss_att=54.755, acc=0.725, loss=61.000, grad_norm=5.164, loss_scale=1.000, learning_rate=8.055e-05, step_time=0.375 [2024-12-07 16:14:23,035] [INFO] [logging.py:129:log_dist] [Rank 0] step=370000, skipped=0, lr=[np.float64(8.053861778978647e-05)], mom=[[0.9, 0.98]] [2024-12-07 16:14:23,036] [INFO] [timer.py:264:stop] epoch=0/micro_step=115000/global_step=115000, RunningAvgSamplesPerSec=43.806179988821356, CurrSamplesPerSec=45.62273625197822, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 16:14:23,037 (deepspeed_trainer:228) INFO: 25epoch:train:9901-10000batch: iter_time=1.100e-04, loss_ctc=73.681, loss_att=52.480, acc=0.705, loss=58.833, grad_norm=5.519, loss_scale=1.000, learning_rate=8.054e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 16:15:00,490 (deepspeed_trainer:228) INFO: 25epoch:train:10001-10100batch: iter_time=1.095e-04, loss_ctc=72.754, loss_att=50.834, acc=0.722, loss=57.379, grad_norm=6.003, loss_scale=1.000, learning_rate=8.053e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 16:15:37,773 (deepspeed_trainer:228) INFO: 25epoch:train:10101-10200batch: iter_time=1.078e-04, loss_ctc=69.652, loss_att=50.375, acc=0.718, loss=56.172, grad_norm=4.927, loss_scale=1.000, learning_rate=8.052e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 16:16:15,104 (deepspeed_trainer:228) INFO: 25epoch:train:10201-10300batch: iter_time=1.119e-04, loss_ctc=74.336, loss_att=50.227, acc=0.717, loss=57.421, grad_norm=4.950, loss_scale=1.000, learning_rate=8.051e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 16:16:52,760 (deepspeed_trainer:228) INFO: 25epoch:train:10301-10400batch: iter_time=1.083e-04, loss_ctc=73.846, loss_att=52.893, acc=0.719, loss=59.183, grad_norm=4.735, loss_scale=1.000, learning_rate=8.050e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 16:17:30,292 (deepspeed_trainer:228) INFO: 25epoch:train:10401-10500batch: iter_time=1.073e-04, loss_ctc=78.116, loss_att=61.906, acc=0.712, loss=66.764, grad_norm=4.953, loss_scale=1.000, learning_rate=8.049e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 16:18:07,698 (deepspeed_trainer:228) INFO: 25epoch:train:10501-10600batch: iter_time=1.091e-04, loss_ctc=72.049, loss_att=54.939, acc=0.716, loss=60.062, grad_norm=4.391, loss_scale=1.000, learning_rate=8.048e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 16:18:45,185 (deepspeed_trainer:228) INFO: 25epoch:train:10601-10700batch: iter_time=1.082e-04, loss_ctc=66.558, loss_att=51.502, acc=0.729, loss=56.029, grad_norm=5.084, loss_scale=1.000, learning_rate=8.047e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 16:19:22,623 (deepspeed_trainer:228) INFO: 25epoch:train:10701-10800batch: iter_time=1.087e-04, loss_ctc=72.970, loss_att=52.827, acc=0.720, loss=58.852, grad_norm=5.091, loss_scale=1.000, learning_rate=8.046e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 16:19:59,907 (deepspeed_trainer:228) INFO: 25epoch:train:10801-10900batch: iter_time=1.063e-04, loss_ctc=71.934, loss_att=50.728, acc=0.721, loss=57.100, grad_norm=4.787, loss_scale=1.000, learning_rate=8.045e-05, step_time=0.372 [2024-12-07 16:20:37,171] [INFO] [logging.py:129:log_dist] [Rank 0] step=371000, skipped=0, lr=[np.float64(8.043000223051e-05)], mom=[[0.9, 0.98]] [2024-12-07 16:20:37,172] [INFO] [timer.py:264:stop] epoch=0/micro_step=116000/global_step=116000, RunningAvgSamplesPerSec=43.806802073342716, CurrSamplesPerSec=46.876099202014515, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 16:20:37,173 (deepspeed_trainer:228) INFO: 25epoch:train:10901-11000batch: iter_time=1.074e-04, loss_ctc=74.661, loss_att=57.096, acc=0.713, loss=62.373, grad_norm=4.939, loss_scale=1.000, learning_rate=8.044e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 16:21:14,323 (deepspeed_trainer:228) INFO: 25epoch:train:11001-11100batch: iter_time=1.084e-04, loss_ctc=66.299, loss_att=48.352, acc=0.725, loss=53.742, grad_norm=4.335, loss_scale=1.000, learning_rate=8.042e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 16:21:52,309 (deepspeed_trainer:228) INFO: 25epoch:train:11101-11200batch: iter_time=1.078e-04, loss_ctc=86.408, loss_att=64.294, acc=0.701, loss=70.906, grad_norm=6.201, loss_scale=1.000, learning_rate=8.041e-05, step_time=0.380 [cnode7-012:0/16] 2024-12-07 16:22:15,439 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-07 16:22:42,552 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 16:22:58,951 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 16:22:58,951 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 16:22:58,953 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 16:23:22,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:23:23,543] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:23:23,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:23:23,363] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:23:24,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:23:25,334] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:23:25,394] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:23:25,429] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:23:25,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:23:26,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:23:26,164] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:23:27,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:23:28,078] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:23:29,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:23:29,305] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:23:29,404] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:10,049] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:11,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:12,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:11,929] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:12,910] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:13,663] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:13,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:14,682] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:14,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:14,950] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:15,400] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:16,336] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:17,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:17,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:18,979] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:19,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:57,752] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:59,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:24:59,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:01,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:01,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:01,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:03,356] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:03,397] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:03,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:03,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:04,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:05,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:07,738] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:08,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:08,803] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:09,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:44,702] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:47,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:47,462] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:48,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:48,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:49,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:50,654] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:51,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:51,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:51,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:53,098] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:54,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:56,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:56,432] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:25:57,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:26:00,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 16:26:45,884 (deepspeed_trainer:228) INFO: 25epoch:train:11201-11300batch: iter_time=2.496, loss_ctc=76.672, loss_att=54.204, acc=0.721, loss=60.949, grad_norm=4.644, loss_scale=1.000, learning_rate=8.040e-05, step_time=0.439 [cnode7-012:0/16] 2024-12-07 16:27:23,386 (deepspeed_trainer:228) INFO: 25epoch:train:11301-11400batch: iter_time=1.066e-04, loss_ctc=86.345, loss_att=65.105, acc=0.714, loss=71.501, grad_norm=6.638, loss_scale=1.000, learning_rate=8.039e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 16:28:00,238 (deepspeed_trainer:228) INFO: 25epoch:train:11401-11500batch: iter_time=1.062e-04, loss_ctc=73.195, loss_att=57.267, acc=0.713, loss=62.044, grad_norm=5.890, loss_scale=1.000, learning_rate=8.038e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 16:28:38,017 (deepspeed_trainer:228) INFO: 25epoch:train:11501-11600batch: iter_time=1.105e-04, loss_ctc=75.879, loss_att=62.988, acc=0.708, loss=66.848, grad_norm=4.637, loss_scale=1.000, learning_rate=8.037e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 16:29:15,285 (deepspeed_trainer:228) INFO: 25epoch:train:11601-11700batch: iter_time=1.061e-04, loss_ctc=76.912, loss_att=57.452, acc=0.709, loss=63.315, grad_norm=5.052, loss_scale=1.000, learning_rate=8.036e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 16:29:52,762 (deepspeed_trainer:228) INFO: 25epoch:train:11701-11800batch: iter_time=1.067e-04, loss_ctc=80.920, loss_att=56.618, acc=0.728, loss=63.907, grad_norm=4.533, loss_scale=1.000, learning_rate=8.035e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 16:30:29,605 (deepspeed_trainer:228) INFO: 25epoch:train:11801-11900batch: iter_time=1.066e-04, loss_ctc=70.260, loss_att=53.310, acc=0.703, loss=58.390, grad_norm=5.564, loss_scale=1.000, learning_rate=8.034e-05, step_time=0.368 [2024-12-07 16:31:07,151] [INFO] [logging.py:129:log_dist] [Rank 0] step=372000, skipped=0, lr=[np.float64(8.032182493088034e-05)], mom=[[0.9, 0.98]] [2024-12-07 16:31:07,152] [INFO] [timer.py:264:stop] epoch=0/micro_step=117000/global_step=117000, RunningAvgSamplesPerSec=43.800640220652376, CurrSamplesPerSec=41.18391253089985, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 16:31:07,153 (deepspeed_trainer:228) INFO: 25epoch:train:11901-12000batch: iter_time=1.063e-04, loss_ctc=68.879, loss_att=49.447, acc=0.723, loss=55.281, grad_norm=4.942, loss_scale=1.000, learning_rate=8.033e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 16:31:44,210 (deepspeed_trainer:228) INFO: 25epoch:train:12001-12100batch: iter_time=1.065e-04, loss_ctc=67.166, loss_att=46.930, acc=0.719, loss=53.006, grad_norm=4.902, loss_scale=1.000, learning_rate=8.032e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 16:32:21,159 (deepspeed_trainer:228) INFO: 25epoch:train:12101-12200batch: iter_time=1.105e-04, loss_ctc=78.392, loss_att=55.752, acc=0.715, loss=62.519, grad_norm=5.720, loss_scale=1.000, learning_rate=8.031e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 16:32:58,164 (deepspeed_trainer:228) INFO: 25epoch:train:12201-12300batch: iter_time=1.078e-04, loss_ctc=71.407, loss_att=51.756, acc=0.719, loss=57.642, grad_norm=4.729, loss_scale=1.000, learning_rate=8.029e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 16:33:35,419 (deepspeed_trainer:228) INFO: 25epoch:train:12301-12400batch: iter_time=1.080e-04, loss_ctc=75.271, loss_att=57.727, acc=0.715, loss=63.031, grad_norm=5.562, loss_scale=1.000, learning_rate=8.028e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 16:34:12,594 (deepspeed_trainer:228) INFO: 25epoch:train:12401-12500batch: iter_time=1.088e-04, loss_ctc=72.767, loss_att=58.065, acc=0.716, loss=62.470, grad_norm=4.708, loss_scale=1.000, learning_rate=8.027e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 16:34:50,580 (deepspeed_trainer:228) INFO: 25epoch:train:12501-12600batch: iter_time=1.063e-04, loss_ctc=70.733, loss_att=52.404, acc=0.728, loss=57.869, grad_norm=4.825, loss_scale=1.000, learning_rate=8.026e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 16:35:27,551 (deepspeed_trainer:228) INFO: 25epoch:train:12601-12700batch: iter_time=1.098e-04, loss_ctc=72.207, loss_att=51.957, acc=0.721, loss=58.034, grad_norm=4.520, loss_scale=1.000, learning_rate=8.025e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 16:36:04,490 (deepspeed_trainer:228) INFO: 25epoch:train:12701-12800batch: iter_time=1.077e-04, loss_ctc=71.323, loss_att=52.519, acc=0.719, loss=58.166, grad_norm=4.712, loss_scale=1.000, learning_rate=8.024e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 16:36:41,228 (deepspeed_trainer:228) INFO: 25epoch:train:12801-12900batch: iter_time=1.065e-04, loss_ctc=67.722, loss_att=49.283, acc=0.725, loss=54.808, grad_norm=4.490, loss_scale=1.000, learning_rate=8.023e-05, step_time=0.367 [2024-12-07 16:37:18,238] [INFO] [logging.py:129:log_dist] [Rank 0] step=373000, skipped=0, lr=[np.float64(8.02140829515351e-05)], mom=[[0.9, 0.98]] [2024-12-07 16:37:18,239] [INFO] [timer.py:264:stop] epoch=0/micro_step=118000/global_step=118000, RunningAvgSamplesPerSec=43.803858566058246, CurrSamplesPerSec=42.70684676812851, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 16:37:18,240 (deepspeed_trainer:228) INFO: 25epoch:train:12901-13000batch: iter_time=1.059e-04, loss_ctc=72.374, loss_att=49.904, acc=0.717, loss=56.644, grad_norm=4.985, loss_scale=1.000, learning_rate=8.022e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 16:37:55,753 (deepspeed_trainer:228) INFO: 25epoch:train:13001-13100batch: iter_time=1.080e-04, loss_ctc=81.969, loss_att=63.085, acc=0.705, loss=68.735, grad_norm=4.851, loss_scale=1.000, learning_rate=8.021e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 16:38:09,513 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-07 16:38:36,422 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 16:38:52,220 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 16:38:52,220 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 16:38:52,223 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 16:39:17,550] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:39:18,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:39:18,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:39:18,556] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:39:18,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:39:19,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:39:19,351] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:39:19,833] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:39:20,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:39:20,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:39:20,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:39:20,936] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:39:22,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:39:22,603] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:39:22,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:39:23,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:05,591] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:06,592] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:06,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:07,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:07,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:07,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:08,135] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:08,147] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:08,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:09,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:09,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:09,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:11,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:12,203] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:12,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:12,834] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:52,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:53,705] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:54,250] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:55,132] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:54,822] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:55,846] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:56,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:56,531] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:56,719] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:57,193] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:56,564] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:58,057] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:59,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:59,471] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:40:59,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:03,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:39,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:39,586] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:41,676] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:40,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:43,145] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:43,526] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:44,155] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:44,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:45,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:46,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:47,982] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:48,604] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:48,924] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:49,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:49,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:41:54,260] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 16:42:48,823 (deepspeed_trainer:228) INFO: 25epoch:train:13101-13200batch: iter_time=2.495, loss_ctc=79.073, loss_att=56.406, acc=0.713, loss=63.236, grad_norm=5.264, loss_scale=1.000, learning_rate=8.020e-05, step_time=0.453 [cnode7-012:0/16] 2024-12-07 16:43:25,945 (deepspeed_trainer:228) INFO: 25epoch:train:13201-13300batch: iter_time=1.032e-04, loss_ctc=82.311, loss_att=65.853, acc=0.704, loss=70.801, grad_norm=5.957, loss_scale=1.000, learning_rate=8.019e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 16:44:03,199 (deepspeed_trainer:228) INFO: 25epoch:train:13301-13400batch: iter_time=1.105e-04, loss_ctc=75.897, loss_att=63.426, acc=0.694, loss=67.153, grad_norm=6.001, loss_scale=1.000, learning_rate=8.018e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 16:44:40,480 (deepspeed_trainer:228) INFO: 25epoch:train:13401-13500batch: iter_time=1.106e-04, loss_ctc=79.519, loss_att=64.104, acc=0.706, loss=68.743, grad_norm=4.907, loss_scale=1.000, learning_rate=8.017e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 16:45:17,522 (deepspeed_trainer:228) INFO: 25epoch:train:13501-13600batch: iter_time=1.088e-04, loss_ctc=71.790, loss_att=52.481, acc=0.710, loss=58.261, grad_norm=5.142, loss_scale=1.000, learning_rate=8.015e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 16:45:54,428 (deepspeed_trainer:228) INFO: 25epoch:train:13601-13700batch: iter_time=1.114e-04, loss_ctc=77.642, loss_att=52.561, acc=0.720, loss=60.112, grad_norm=5.036, loss_scale=1.000, learning_rate=8.014e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 16:46:31,380 (deepspeed_trainer:228) INFO: 25epoch:train:13701-13800batch: iter_time=1.082e-04, loss_ctc=72.452, loss_att=55.674, acc=0.705, loss=60.694, grad_norm=5.735, loss_scale=1.000, learning_rate=8.013e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 16:47:08,328 (deepspeed_trainer:228) INFO: 25epoch:train:13801-13900batch: iter_time=1.104e-04, loss_ctc=65.326, loss_att=46.150, acc=0.722, loss=51.907, grad_norm=4.642, loss_scale=1.000, learning_rate=8.012e-05, step_time=0.369 [2024-12-07 16:47:45,369] [INFO] [logging.py:129:log_dist] [Rank 0] step=374000, skipped=0, lr=[np.float64(8.010677338063768e-05)], mom=[[0.9, 0.98]] [2024-12-07 16:47:45,370] [INFO] [timer.py:264:stop] epoch=0/micro_step=119000/global_step=119000, RunningAvgSamplesPerSec=43.7987549519755, CurrSamplesPerSec=42.99969862213222, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 16:47:45,371 (deepspeed_trainer:228) INFO: 25epoch:train:13901-14000batch: iter_time=1.119e-04, loss_ctc=72.806, loss_att=50.985, acc=0.711, loss=57.525, grad_norm=4.880, loss_scale=1.000, learning_rate=8.011e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 16:48:22,554 (deepspeed_trainer:228) INFO: 25epoch:train:14001-14100batch: iter_time=1.097e-04, loss_ctc=75.834, loss_att=53.964, acc=0.719, loss=60.521, grad_norm=5.308, loss_scale=1.000, learning_rate=8.010e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 16:48:59,713 (deepspeed_trainer:228) INFO: 25epoch:train:14101-14200batch: iter_time=1.113e-04, loss_ctc=74.823, loss_att=56.296, acc=0.711, loss=61.835, grad_norm=4.523, loss_scale=1.000, learning_rate=8.009e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 16:49:36,632 (deepspeed_trainer:228) INFO: 25epoch:train:14201-14300batch: iter_time=1.074e-04, loss_ctc=76.498, loss_att=56.909, acc=0.712, loss=62.793, grad_norm=5.302, loss_scale=1.000, learning_rate=8.008e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 16:50:13,465 (deepspeed_trainer:228) INFO: 25epoch:train:14301-14400batch: iter_time=1.104e-04, loss_ctc=65.917, loss_att=50.976, acc=0.714, loss=55.451, grad_norm=4.584, loss_scale=1.000, learning_rate=8.007e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 16:50:50,658 (deepspeed_trainer:228) INFO: 25epoch:train:14401-14500batch: iter_time=1.089e-04, loss_ctc=73.948, loss_att=56.354, acc=0.720, loss=61.666, grad_norm=4.637, loss_scale=1.000, learning_rate=8.006e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 16:51:28,698 (deepspeed_trainer:228) INFO: 25epoch:train:14501-14600batch: iter_time=1.059e-04, loss_ctc=70.291, loss_att=50.557, acc=0.713, loss=56.512, grad_norm=4.284, loss_scale=1.000, learning_rate=8.005e-05, step_time=0.380 [cnode7-012:0/16] 2024-12-07 16:52:05,453 (deepspeed_trainer:228) INFO: 25epoch:train:14601-14700batch: iter_time=1.068e-04, loss_ctc=73.744, loss_att=51.544, acc=0.722, loss=58.193, grad_norm=5.649, loss_scale=1.000, learning_rate=8.004e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 16:52:42,041 (deepspeed_trainer:228) INFO: 25epoch:train:14701-14800batch: iter_time=1.079e-04, loss_ctc=65.504, loss_att=46.700, acc=0.720, loss=52.339, grad_norm=4.708, loss_scale=1.000, learning_rate=8.003e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 16:53:18,739 (deepspeed_trainer:228) INFO: 25epoch:train:14801-14900batch: iter_time=1.064e-04, loss_ctc=80.389, loss_att=56.349, acc=0.703, loss=63.538, grad_norm=5.003, loss_scale=1.000, learning_rate=8.002e-05, step_time=0.367 [2024-12-07 16:53:55,486] [INFO] [logging.py:129:log_dist] [Rank 0] step=375000, skipped=0, lr=[np.float64(7.999989333354667e-05)], mom=[[0.9, 0.98]] [2024-12-07 16:53:55,487] [INFO] [timer.py:264:stop] epoch=0/micro_step=120000/global_step=120000, RunningAvgSamplesPerSec=43.80312271205331, CurrSamplesPerSec=43.23029792771907, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 16:53:55,488 (deepspeed_trainer:228) INFO: 25epoch:train:14901-15000batch: iter_time=1.064e-04, loss_ctc=78.643, loss_att=61.078, acc=0.705, loss=66.353, grad_norm=5.219, loss_scale=1.000, learning_rate=8.001e-05, step_time=0.367 [2024-12-07 16:54:11,491] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:12,255] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:11,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:12,814] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:13,012] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:13,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:12,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:13,422] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:12,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:13,587] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:13,623] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:13,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:13,071] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:13,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:13,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:13,185] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:27,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:28,130] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:27,749] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:28,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:28,785] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:29,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:28,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:29,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:29,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:30,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:30,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:30,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:29,728] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:29,884] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:29,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:29,974] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:42,502] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:43,467] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:44,349] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:44,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:44,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:45,086] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:45,201] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:44,701] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:45,713] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:46,304] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:46,417] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:46,485] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:45,992] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:46,617] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:46,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:54:46,816] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:01,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:02,604] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:02,170] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:03,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:03,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:03,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:03,103] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:03,967] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:04,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:04,417] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:04,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:05,876] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:06,143] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:06,464] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:06,836] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:07,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:55:20,368] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 25 is about to be saved! [2024-12-07 16:55:20,398] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/mp_rank_00_model_states.pt [2024-12-07 16:55:20,398] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/mp_rank_00_model_states.pt... [2024-12-07 16:55:22,270] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/mp_rank_00_model_states.pt. [2024-12-07 16:55:23,002] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-07 16:55:22,426] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-07 16:55:23,004] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-07 16:55:22,429] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-07 16:55:23,006] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-07 16:55:23,007] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-07 16:55:22,432] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-07 16:55:23,008] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-07 16:55:22,432] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-07 16:55:23,008] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-07 16:55:22,433] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-07 16:55:23,009] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-07 16:55:22,433] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-07 16:55:23,009] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-07 16:55:22,434] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-07 16:55:22,434] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-07 16:55:23,187] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,187] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,188] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [2024-12-07 16:55:23,763] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,764] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,764] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [2024-12-07 16:55:23,770] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,770] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,770] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [2024-12-07 16:55:23,199] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,199] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,199] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [2024-12-07 16:55:23,212] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,793] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,793] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,793] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [2024-12-07 16:55:23,799] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,799] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,799] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [2024-12-07 16:55:23,804] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,804] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,804] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [2024-12-07 16:55:23,242] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,243] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,243] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [2024-12-07 16:55:23,821] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,821] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,821] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [2024-12-07 16:55:23,246] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,247] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,247] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [2024-12-07 16:55:23,257] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,258] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,258] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [2024-12-07 16:55:23,258] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,258] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,258] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [2024-12-07 16:55:23,838] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,838] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,838] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [2024-12-07 16:55:23,844] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,844] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,844] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [2024-12-07 16:55:23,268] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-07 16:55:23,268] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,268] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [2024-12-07 16:55:23,396] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_25/25/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-07 16:55:23,397] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 25 is ready now! [cnode7-012:0/16] 2024-12-07 16:55:23,552 (deepspeed_trainer:158) INFO: 25epoch results: [train] iter_time=0.133, loss_ctc=75.038, loss_att=55.449, acc=0.712, loss=61.326, grad_norm=5.130, loss_scale=1.000, learning_rate=8.082e-05, step_time=0.374, time=2 hours, 7 minutes and 6.27 seconds, total_count=375025, gpu_max_cached_mem_GB=28.912, [valid] loss_ctc=5.156, cer_ctc=0.127, loss_att=6.719, acc=0.817, cer=0.394, wer=1.000, loss=6.250, time=1 minute and 14.69 seconds, total_count=25, gpu_max_cached_mem_GB=28.912 [cnode7-012:0/16] 2024-12-07 16:55:26,115 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-07 16:55:52,304 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 16:56:08,392 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 16:56:08,393 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 16:56:08,395 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 16:56:28,852] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:56:29,541] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:56:30,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:56:30,935] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:56:31,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:56:31,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:56:32,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:56:31,803] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:56:32,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:56:33,218] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:56:32,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:56:33,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:56:34,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:56:34,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:56:34,511] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:56:34,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:16,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:17,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:19,027] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:19,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:20,379] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:20,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:19,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:20,200] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:20,910] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:21,005] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:20,816] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:21,658] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:21,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:22,454] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:22,521] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:57:22,557] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:03,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:06,157] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:08,376] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:08,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:08,299] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:08,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:08,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:09,152] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:08,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:09,823] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:09,507] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:10,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:09,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:10,684] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:10,429] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:12,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:51,046] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:52,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:56,079] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:56,356] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:56,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:56,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:56,524] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:57,137] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:56,762] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:57,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:57,320] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:58,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:58,423] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:58,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:58:59,502] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 16:59:02,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 17:00:04,747 (deepspeed_trainer:228) INFO: 26epoch:train:1-100batch: iter_time=2.397, loss_ctc=71.653, loss_att=49.215, acc=0.718, loss=55.951, grad_norm=5.065, loss_scale=1.000, learning_rate=7.999e-05, step_time=0.388 [cnode7-012:0/16] 2024-12-07 17:00:42,229 (deepspeed_trainer:228) INFO: 26epoch:train:101-200batch: iter_time=1.092e-04, loss_ctc=76.360, loss_att=63.460, acc=0.703, loss=67.341, grad_norm=5.217, loss_scale=1.000, learning_rate=7.998e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 17:01:19,468 (deepspeed_trainer:228) INFO: 26epoch:train:201-300batch: iter_time=1.077e-04, loss_ctc=71.844, loss_att=53.898, acc=0.706, loss=59.295, grad_norm=5.414, loss_scale=1.000, learning_rate=7.997e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 17:01:56,679 (deepspeed_trainer:228) INFO: 26epoch:train:301-400batch: iter_time=1.102e-04, loss_ctc=76.495, loss_att=59.032, acc=0.707, loss=64.263, grad_norm=4.889, loss_scale=1.000, learning_rate=7.996e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 17:02:33,644 (deepspeed_trainer:228) INFO: 26epoch:train:401-500batch: iter_time=1.046e-04, loss_ctc=72.786, loss_att=51.630, acc=0.713, loss=57.994, grad_norm=5.328, loss_scale=1.000, learning_rate=7.995e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 17:03:11,253 (deepspeed_trainer:228) INFO: 26epoch:train:501-600batch: iter_time=1.088e-04, loss_ctc=73.549, loss_att=54.444, acc=0.717, loss=60.191, grad_norm=4.590, loss_scale=1.000, learning_rate=7.994e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 17:03:48,765 (deepspeed_trainer:228) INFO: 26epoch:train:601-700batch: iter_time=1.107e-04, loss_ctc=73.946, loss_att=54.613, acc=0.722, loss=60.417, grad_norm=4.593, loss_scale=1.000, learning_rate=7.993e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 17:04:26,653 (deepspeed_trainer:228) INFO: 26epoch:train:701-800batch: iter_time=1.099e-04, loss_ctc=71.494, loss_att=57.624, acc=0.701, loss=61.773, grad_norm=5.747, loss_scale=1.000, learning_rate=7.992e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 17:05:04,095 (deepspeed_trainer:228) INFO: 26epoch:train:801-900batch: iter_time=1.068e-04, loss_ctc=78.415, loss_att=55.782, acc=0.715, loss=62.578, grad_norm=5.278, loss_scale=1.000, learning_rate=7.991e-05, step_time=0.374 [2024-12-07 17:05:41,695] [INFO] [logging.py:129:log_dist] [Rank 0] step=376000, skipped=0, lr=[np.float64(7.989343995249023e-05)], mom=[[0.9, 0.98]] [2024-12-07 17:05:41,696] [INFO] [timer.py:264:stop] epoch=0/micro_step=121000/global_step=121000, RunningAvgSamplesPerSec=43.80158664544518, CurrSamplesPerSec=43.98656813920329, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 17:05:41,698 (deepspeed_trainer:228) INFO: 26epoch:train:901-1000batch: iter_time=1.044e-04, loss_ctc=65.912, loss_att=51.195, acc=0.702, loss=55.617, grad_norm=5.067, loss_scale=1.000, learning_rate=7.990e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 17:06:19,175 (deepspeed_trainer:228) INFO: 26epoch:train:1001-1100batch: iter_time=1.084e-04, loss_ctc=73.999, loss_att=58.073, acc=0.725, loss=62.866, grad_norm=4.611, loss_scale=1.000, learning_rate=7.989e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 17:06:56,838 (deepspeed_trainer:228) INFO: 26epoch:train:1101-1200batch: iter_time=1.057e-04, loss_ctc=79.587, loss_att=67.631, acc=0.702, loss=71.184, grad_norm=4.902, loss_scale=1.000, learning_rate=7.988e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 17:07:34,180 (deepspeed_trainer:228) INFO: 26epoch:train:1201-1300batch: iter_time=1.116e-04, loss_ctc=68.743, loss_att=52.639, acc=0.708, loss=57.448, grad_norm=4.716, loss_scale=1.000, learning_rate=7.987e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 17:08:11,938 (deepspeed_trainer:228) INFO: 26epoch:train:1301-1400batch: iter_time=1.077e-04, loss_ctc=70.069, loss_att=54.579, acc=0.715, loss=59.228, grad_norm=4.786, loss_scale=1.000, learning_rate=7.986e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 17:08:49,408 (deepspeed_trainer:228) INFO: 26epoch:train:1401-1500batch: iter_time=1.056e-04, loss_ctc=67.454, loss_att=46.866, acc=0.717, loss=53.041, grad_norm=4.653, loss_scale=1.000, learning_rate=7.985e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 17:09:26,610 (deepspeed_trainer:228) INFO: 26epoch:train:1501-1600batch: iter_time=1.057e-04, loss_ctc=70.736, loss_att=48.817, acc=0.723, loss=55.392, grad_norm=4.309, loss_scale=1.000, learning_rate=7.984e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 17:10:03,947 (deepspeed_trainer:228) INFO: 26epoch:train:1601-1700batch: iter_time=1.064e-04, loss_ctc=70.000, loss_att=56.728, acc=0.717, loss=60.705, grad_norm=5.073, loss_scale=1.000, learning_rate=7.982e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 17:10:40,940 (deepspeed_trainer:228) INFO: 26epoch:train:1701-1800batch: iter_time=1.055e-04, loss_ctc=67.172, loss_att=46.295, acc=0.713, loss=52.563, grad_norm=5.607, loss_scale=1.000, learning_rate=7.981e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 17:11:14,220 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-07 17:11:41,011 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 17:11:58,105 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 17:11:58,105 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 17:11:58,107 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 17:12:24,101] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:12:24,157] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:12:24,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:12:26,812] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:12:27,590] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:12:27,678] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:12:27,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:12:28,708] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:12:29,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:12:29,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:12:29,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:12:29,444] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:12:29,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:12:29,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:12:29,557] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:12:30,270] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:14,687] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:14,756] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:16,205] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:16,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:17,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:17,320] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:18,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:18,195] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:18,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:19,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:19,163] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:19,896] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:19,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:20,067] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:20,135] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:13:20,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:01,789] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:02,136] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:04,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:04,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:05,497] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:06,103] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:05,900] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:06,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:06,515] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:06,834] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:07,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:06,916] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:07,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:07,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:07,889] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:09,047] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:48,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:48,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:52,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:52,447] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:53,730] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:54,396] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:54,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:54,390] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:55,220] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:55,130] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:56,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:56,077] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:56,551] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:57,023] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:56,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:14:57,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 17:15:33,759 (deepspeed_trainer:228) INFO: 26epoch:train:1801-1900batch: iter_time=2.509, loss_ctc=84.303, loss_att=60.135, acc=0.700, loss=67.401, grad_norm=6.614, loss_scale=1.000, learning_rate=7.980e-05, step_time=0.419 [2024-12-07 17:16:11,578] [INFO] [logging.py:129:log_dist] [Rank 0] step=377000, skipped=0, lr=[np.float64(7.978741040624512e-05)], mom=[[0.9, 0.98]] [2024-12-07 17:16:11,581] [INFO] [timer.py:264:stop] epoch=0/micro_step=122000/global_step=122000, RunningAvgSamplesPerSec=43.79719103384086, CurrSamplesPerSec=40.216587294287976, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 17:16:11,583 (deepspeed_trainer:228) INFO: 26epoch:train:1901-2000batch: iter_time=1.058e-04, loss_ctc=69.403, loss_att=50.640, acc=0.716, loss=56.274, grad_norm=4.939, loss_scale=1.000, learning_rate=7.979e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 17:16:49,133 (deepspeed_trainer:228) INFO: 26epoch:train:2001-2100batch: iter_time=1.059e-04, loss_ctc=73.049, loss_att=60.266, acc=0.700, loss=64.078, grad_norm=5.183, loss_scale=1.000, learning_rate=7.978e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 17:17:26,267 (deepspeed_trainer:228) INFO: 26epoch:train:2101-2200batch: iter_time=1.087e-04, loss_ctc=70.710, loss_att=54.119, acc=0.700, loss=59.100, grad_norm=5.027, loss_scale=1.000, learning_rate=7.977e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 17:18:03,445 (deepspeed_trainer:228) INFO: 26epoch:train:2201-2300batch: iter_time=1.070e-04, loss_ctc=77.908, loss_att=59.059, acc=0.697, loss=64.669, grad_norm=5.815, loss_scale=1.000, learning_rate=7.976e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 17:18:40,515 (deepspeed_trainer:228) INFO: 26epoch:train:2301-2400batch: iter_time=1.098e-04, loss_ctc=70.790, loss_att=51.646, acc=0.720, loss=57.383, grad_norm=4.579, loss_scale=1.000, learning_rate=7.975e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 17:19:17,610 (deepspeed_trainer:228) INFO: 26epoch:train:2401-2500batch: iter_time=1.090e-04, loss_ctc=74.043, loss_att=55.041, acc=0.713, loss=60.746, grad_norm=5.043, loss_scale=1.000, learning_rate=7.974e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 17:19:55,444 (deepspeed_trainer:228) INFO: 26epoch:train:2501-2600batch: iter_time=1.088e-04, loss_ctc=71.364, loss_att=54.977, acc=0.721, loss=59.913, grad_norm=4.785, loss_scale=1.000, learning_rate=7.973e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 17:20:33,242 (deepspeed_trainer:228) INFO: 26epoch:train:2601-2700batch: iter_time=1.075e-04, loss_ctc=73.562, loss_att=55.113, acc=0.705, loss=60.676, grad_norm=5.226, loss_scale=1.000, learning_rate=7.972e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 17:21:10,001 (deepspeed_trainer:228) INFO: 26epoch:train:2701-2800batch: iter_time=1.075e-04, loss_ctc=70.106, loss_att=50.655, acc=0.707, loss=56.506, grad_norm=5.329, loss_scale=1.000, learning_rate=7.971e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 17:21:46,925 (deepspeed_trainer:228) INFO: 26epoch:train:2801-2900batch: iter_time=1.068e-04, loss_ctc=70.043, loss_att=55.047, acc=0.705, loss=59.515, grad_norm=4.554, loss_scale=1.000, learning_rate=7.970e-05, step_time=0.369 [2024-12-07 17:22:23,984] [INFO] [logging.py:129:log_dist] [Rank 0] step=378000, skipped=0, lr=[np.float64(7.968180188982044e-05)], mom=[[0.9, 0.98]] [2024-12-07 17:22:23,985] [INFO] [timer.py:264:stop] epoch=0/micro_step=123000/global_step=123000, RunningAvgSamplesPerSec=43.79941044025071, CurrSamplesPerSec=46.18647446574825, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 17:22:23,987 (deepspeed_trainer:228) INFO: 26epoch:train:2901-3000batch: iter_time=1.082e-04, loss_ctc=75.026, loss_att=56.940, acc=0.721, loss=62.368, grad_norm=4.657, loss_scale=1.000, learning_rate=7.969e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 17:23:01,327 (deepspeed_trainer:228) INFO: 26epoch:train:3001-3100batch: iter_time=1.084e-04, loss_ctc=75.749, loss_att=66.643, acc=0.693, loss=69.395, grad_norm=5.056, loss_scale=1.000, learning_rate=7.968e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 17:23:38,707 (deepspeed_trainer:228) INFO: 26epoch:train:3101-3200batch: iter_time=1.110e-04, loss_ctc=70.467, loss_att=54.716, acc=0.707, loss=59.457, grad_norm=4.855, loss_scale=1.000, learning_rate=7.967e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 17:24:15,538 (deepspeed_trainer:228) INFO: 26epoch:train:3201-3300batch: iter_time=1.103e-04, loss_ctc=65.426, loss_att=47.652, acc=0.719, loss=52.968, grad_norm=4.572, loss_scale=1.000, learning_rate=7.966e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 17:24:52,584 (deepspeed_trainer:228) INFO: 26epoch:train:3301-3400batch: iter_time=1.086e-04, loss_ctc=71.273, loss_att=48.509, acc=0.712, loss=55.329, grad_norm=4.640, loss_scale=1.000, learning_rate=7.964e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 17:25:29,613 (deepspeed_trainer:228) INFO: 26epoch:train:3401-3500batch: iter_time=1.110e-04, loss_ctc=66.140, loss_att=47.446, acc=0.730, loss=53.085, grad_norm=3.920, loss_scale=1.000, learning_rate=7.963e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 17:26:07,192 (deepspeed_trainer:228) INFO: 26epoch:train:3501-3600batch: iter_time=1.109e-04, loss_ctc=72.482, loss_att=55.704, acc=0.700, loss=60.773, grad_norm=5.366, loss_scale=1.000, learning_rate=7.962e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 17:26:44,383 (deepspeed_trainer:228) INFO: 26epoch:train:3601-3700batch: iter_time=1.073e-04, loss_ctc=66.013, loss_att=48.758, acc=0.717, loss=53.940, grad_norm=4.817, loss_scale=1.000, learning_rate=7.961e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 17:27:07,145 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-07 17:27:33,973 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 17:27:49,846 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 17:27:49,846 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 17:27:49,848 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 17:28:15,007] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:28:15,460] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:28:15,557] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:28:17,617] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:28:17,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:28:18,127] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:28:18,796] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:28:18,851] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:28:18,827] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:28:19,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:28:19,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:28:19,663] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:28:19,677] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:28:19,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:28:20,155] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:28:22,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:02,286] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:02,846] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:03,326] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:05,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:05,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:06,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:06,822] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:07,203] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:07,332] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:07,900] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:07,834] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:07,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:08,412] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:08,384] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:08,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:11,106] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:49,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:50,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:50,777] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:54,198] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:54,567] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:54,604] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:54,865] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:55,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:55,425] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:55,810] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:56,471] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:56,466] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:56,686] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:57,008] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:29:59,183] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:00,521] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:35,378] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:36,924] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:37,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:41,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:42,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:42,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:42,743] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:43,137] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:43,165] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:44,482] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:45,150] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:45,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:45,907] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:46,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:48,027] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:30:49,712] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 17:31:34,143 (deepspeed_trainer:228) INFO: 26epoch:train:3701-3800batch: iter_time=2.472, loss_ctc=76.199, loss_att=52.373, acc=0.707, loss=59.549, grad_norm=6.802, loss_scale=1.000, learning_rate=7.960e-05, step_time=0.426 [cnode7-012:0/16] 2024-12-07 17:32:11,848 (deepspeed_trainer:228) INFO: 26epoch:train:3801-3900batch: iter_time=1.134e-04, loss_ctc=76.513, loss_att=59.978, acc=0.713, loss=64.931, grad_norm=4.860, loss_scale=1.000, learning_rate=7.959e-05, step_time=0.377 [2024-12-07 17:32:49,679] [INFO] [logging.py:129:log_dist] [Rank 0] step=379000, skipped=0, lr=[np.float64(7.957661162414598e-05)], mom=[[0.9, 0.98]] [2024-12-07 17:32:49,679] [INFO] [timer.py:264:stop] epoch=0/micro_step=124000/global_step=124000, RunningAvgSamplesPerSec=43.79550069956156, CurrSamplesPerSec=42.32704699365333, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 17:32:49,681 (deepspeed_trainer:228) INFO: 26epoch:train:3901-4000batch: iter_time=1.091e-04, loss_ctc=68.072, loss_att=56.097, acc=0.706, loss=59.712, grad_norm=4.685, loss_scale=1.000, learning_rate=7.958e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 17:33:27,577 (deepspeed_trainer:228) INFO: 26epoch:train:4001-4100batch: iter_time=1.112e-04, loss_ctc=79.488, loss_att=59.973, acc=0.707, loss=65.831, grad_norm=5.283, loss_scale=1.000, learning_rate=7.957e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-07 17:34:05,065 (deepspeed_trainer:228) INFO: 26epoch:train:4101-4200batch: iter_time=1.092e-04, loss_ctc=70.882, loss_att=51.249, acc=0.713, loss=57.154, grad_norm=5.356, loss_scale=1.000, learning_rate=7.956e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 17:34:42,633 (deepspeed_trainer:228) INFO: 26epoch:train:4201-4300batch: iter_time=1.069e-04, loss_ctc=74.119, loss_att=54.722, acc=0.725, loss=60.538, grad_norm=4.555, loss_scale=1.000, learning_rate=7.955e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 17:35:20,237 (deepspeed_trainer:228) INFO: 26epoch:train:4301-4400batch: iter_time=1.113e-04, loss_ctc=69.493, loss_att=51.719, acc=0.723, loss=57.096, grad_norm=4.587, loss_scale=1.000, learning_rate=7.954e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 17:35:57,917 (deepspeed_trainer:228) INFO: 26epoch:train:4401-4500batch: iter_time=1.135e-04, loss_ctc=72.121, loss_att=57.216, acc=0.711, loss=61.675, grad_norm=4.839, loss_scale=1.000, learning_rate=7.953e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 17:36:35,376 (deepspeed_trainer:228) INFO: 26epoch:train:4501-4600batch: iter_time=1.130e-04, loss_ctc=75.850, loss_att=54.683, acc=0.721, loss=61.036, grad_norm=5.650, loss_scale=1.000, learning_rate=7.952e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 17:37:12,384 (deepspeed_trainer:228) INFO: 26epoch:train:4601-4700batch: iter_time=1.100e-04, loss_ctc=65.823, loss_att=50.822, acc=0.706, loss=55.323, grad_norm=5.408, loss_scale=1.000, learning_rate=7.951e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 17:37:50,163 (deepspeed_trainer:228) INFO: 26epoch:train:4701-4800batch: iter_time=1.105e-04, loss_ctc=72.367, loss_att=56.689, acc=0.717, loss=61.406, grad_norm=4.804, loss_scale=1.000, learning_rate=7.950e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 17:38:27,392 (deepspeed_trainer:228) INFO: 26epoch:train:4801-4900batch: iter_time=1.085e-04, loss_ctc=83.422, loss_att=68.754, acc=0.713, loss=73.147, grad_norm=5.107, loss_scale=1.000, learning_rate=7.949e-05, step_time=0.372 [2024-12-07 17:39:04,002] [INFO] [logging.py:129:log_dist] [Rank 0] step=380000, skipped=0, lr=[np.float64(7.947183685576504e-05)], mom=[[0.9, 0.98]] [2024-12-07 17:39:04,003] [INFO] [timer.py:264:stop] epoch=0/micro_step=125000/global_step=125000, RunningAvgSamplesPerSec=43.79568762055591, CurrSamplesPerSec=46.47932979477707, MemAllocated=2.04GB, MaxMemAllocated=19.27GB [cnode7-012:0/16] 2024-12-07 17:39:04,004 (deepspeed_trainer:228) INFO: 26epoch:train:4901-5000batch: iter_time=1.087e-04, loss_ctc=65.812, loss_att=53.891, acc=0.707, loss=57.469, grad_norm=4.719, loss_scale=1.000, learning_rate=7.948e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 17:39:41,178 (deepspeed_trainer:228) INFO: 26epoch:train:5001-5100batch: iter_time=1.091e-04, loss_ctc=72.219, loss_att=54.640, acc=0.715, loss=59.916, grad_norm=4.803, loss_scale=1.000, learning_rate=7.947e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 17:40:17,976 (deepspeed_trainer:228) INFO: 26epoch:train:5101-5200batch: iter_time=1.129e-04, loss_ctc=66.226, loss_att=48.675, acc=0.717, loss=53.941, grad_norm=4.954, loss_scale=1.000, learning_rate=7.946e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 17:40:54,897 (deepspeed_trainer:228) INFO: 26epoch:train:5201-5300batch: iter_time=1.100e-04, loss_ctc=70.728, loss_att=46.728, acc=0.725, loss=53.958, grad_norm=4.656, loss_scale=1.000, learning_rate=7.945e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 17:41:31,646 (deepspeed_trainer:228) INFO: 26epoch:train:5301-5400batch: iter_time=1.078e-04, loss_ctc=65.252, loss_att=53.482, acc=0.721, loss=57.007, grad_norm=4.380, loss_scale=1.000, learning_rate=7.944e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 17:42:08,229 (deepspeed_trainer:228) INFO: 26epoch:train:5401-5500batch: iter_time=1.101e-04, loss_ctc=69.886, loss_att=48.001, acc=0.715, loss=54.578, grad_norm=5.034, loss_scale=1.000, learning_rate=7.942e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 17:42:44,793 (deepspeed_trainer:228) INFO: 26epoch:train:5501-5600batch: iter_time=1.103e-04, loss_ctc=69.269, loss_att=52.946, acc=0.715, loss=57.842, grad_norm=5.949, loss_scale=1.000, learning_rate=7.941e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-07 17:42:58,378 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-07 17:43:24,441 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 17:43:41,285 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 17:43:41,285 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 17:43:41,287 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 17:44:05,538] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:05,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:05,901] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:07,517] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:08,233] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:08,312] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:08,973] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:09,084] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:09,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:09,487] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:09,600] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:10,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:11,223] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:11,434] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:11,819] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:11,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:52,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:53,668] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:53,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:56,571] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:56,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:57,121] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:57,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:57,513] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:58,034] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:58,245] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:44:59,898] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:00,021] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:00,363] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:00,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:00,551] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:03,452] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:40,866] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:41,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:41,270] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:44,131] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:44,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:45,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:46,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:46,177] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:46,640] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:46,881] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:47,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:47,910] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:48,303] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:49,343] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:50,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:45:52,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:27,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:27,893] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:28,116] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:31,643] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:33,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:33,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:33,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:34,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:35,090] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:35,132] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:35,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:37,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:37,336] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:38,024] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:39,266] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:46:40,420] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 17:47:35,571 (deepspeed_trainer:228) INFO: 26epoch:train:5601-5700batch: iter_time=2.504, loss_ctc=74.511, loss_att=49.200, acc=0.718, loss=56.809, grad_norm=5.410, loss_scale=1.000, learning_rate=7.940e-05, step_time=0.404 [cnode7-012:0/16] 2024-12-07 17:48:13,293 (deepspeed_trainer:228) INFO: 26epoch:train:5701-5800batch: iter_time=1.042e-04, loss_ctc=71.169, loss_att=57.488, acc=0.710, loss=61.563, grad_norm=5.008, loss_scale=1.000, learning_rate=7.939e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 17:48:50,946 (deepspeed_trainer:228) INFO: 26epoch:train:5801-5900batch: iter_time=1.063e-04, loss_ctc=70.839, loss_att=55.547, acc=0.715, loss=60.130, grad_norm=5.012, loss_scale=1.000, learning_rate=7.938e-05, step_time=0.376 [2024-12-07 17:49:28,653] [INFO] [logging.py:129:log_dist] [Rank 0] step=381000, skipped=0, lr=[np.float64(7.93674748565317e-05)], mom=[[0.9, 0.98]] [2024-12-07 17:49:28,654] [INFO] [timer.py:264:stop] epoch=0/micro_step=126000/global_step=126000, RunningAvgSamplesPerSec=43.79596423245343, CurrSamplesPerSec=43.20908803787821, MemAllocated=2.04GB, MaxMemAllocated=23.93GB [cnode7-012:0/16] 2024-12-07 17:49:28,655 (deepspeed_trainer:228) INFO: 26epoch:train:5901-6000batch: iter_time=1.067e-04, loss_ctc=79.464, loss_att=59.726, acc=0.705, loss=65.683, grad_norm=5.360, loss_scale=1.000, learning_rate=7.937e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 17:50:05,738 (deepspeed_trainer:228) INFO: 26epoch:train:6001-6100batch: iter_time=1.099e-04, loss_ctc=68.912, loss_att=48.546, acc=0.721, loss=54.673, grad_norm=4.987, loss_scale=1.000, learning_rate=7.936e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 17:50:43,426 (deepspeed_trainer:228) INFO: 26epoch:train:6101-6200batch: iter_time=1.090e-04, loss_ctc=74.900, loss_att=55.830, acc=0.724, loss=61.544, grad_norm=4.544, loss_scale=1.000, learning_rate=7.935e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 17:51:20,604 (deepspeed_trainer:228) INFO: 26epoch:train:6201-6300batch: iter_time=1.065e-04, loss_ctc=69.866, loss_att=51.870, acc=0.725, loss=57.281, grad_norm=4.837, loss_scale=1.000, learning_rate=7.934e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 17:51:57,762 (deepspeed_trainer:228) INFO: 26epoch:train:6301-6400batch: iter_time=1.062e-04, loss_ctc=69.622, loss_att=57.231, acc=0.709, loss=60.958, grad_norm=4.470, loss_scale=1.000, learning_rate=7.933e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 17:52:34,946 (deepspeed_trainer:228) INFO: 26epoch:train:6401-6500batch: iter_time=1.081e-04, loss_ctc=75.646, loss_att=55.026, acc=0.726, loss=61.229, grad_norm=5.742, loss_scale=1.000, learning_rate=7.932e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 17:53:12,030 (deepspeed_trainer:228) INFO: 26epoch:train:6501-6600batch: iter_time=1.111e-04, loss_ctc=65.026, loss_att=48.656, acc=0.711, loss=53.561, grad_norm=5.166, loss_scale=1.000, learning_rate=7.931e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 17:53:49,731 (deepspeed_trainer:228) INFO: 26epoch:train:6601-6700batch: iter_time=1.179e-04, loss_ctc=74.337, loss_att=58.442, acc=0.719, loss=63.218, grad_norm=4.640, loss_scale=1.000, learning_rate=7.930e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 17:54:27,310 (deepspeed_trainer:228) INFO: 26epoch:train:6701-6800batch: iter_time=1.132e-04, loss_ctc=81.795, loss_att=70.409, acc=0.707, loss=73.817, grad_norm=5.066, loss_scale=1.000, learning_rate=7.929e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 17:55:04,103 (deepspeed_trainer:228) INFO: 26epoch:train:6801-6900batch: iter_time=1.121e-04, loss_ctc=66.588, loss_att=50.981, acc=0.713, loss=55.685, grad_norm=4.889, loss_scale=1.000, learning_rate=7.928e-05, step_time=0.367 [2024-12-07 17:55:41,089] [INFO] [logging.py:129:log_dist] [Rank 0] step=382000, skipped=0, lr=[np.float64(7.926352292331251e-05)], mom=[[0.9, 0.98]] [2024-12-07 17:55:41,090] [INFO] [timer.py:264:stop] epoch=0/micro_step=127000/global_step=127000, RunningAvgSamplesPerSec=43.79790443909188, CurrSamplesPerSec=45.40545590435821, MemAllocated=2.04GB, MaxMemAllocated=23.93GB [cnode7-012:0/16] 2024-12-07 17:55:41,091 (deepspeed_trainer:228) INFO: 26epoch:train:6901-7000batch: iter_time=1.076e-04, loss_ctc=70.892, loss_att=54.028, acc=0.717, loss=59.081, grad_norm=4.524, loss_scale=1.000, learning_rate=7.927e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 17:56:17,556 (deepspeed_trainer:228) INFO: 26epoch:train:7001-7100batch: iter_time=1.097e-04, loss_ctc=63.679, loss_att=46.879, acc=0.711, loss=51.901, grad_norm=4.757, loss_scale=1.000, learning_rate=7.926e-05, step_time=0.364 [cnode7-012:0/16] 2024-12-07 17:56:54,544 (deepspeed_trainer:228) INFO: 26epoch:train:7101-7200batch: iter_time=1.129e-04, loss_ctc=70.589, loss_att=46.583, acc=0.732, loss=53.781, grad_norm=4.815, loss_scale=1.000, learning_rate=7.925e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 17:57:31,527 (deepspeed_trainer:228) INFO: 26epoch:train:7201-7300batch: iter_time=1.109e-04, loss_ctc=69.361, loss_att=57.103, acc=0.720, loss=60.769, grad_norm=4.796, loss_scale=1.000, learning_rate=7.924e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 17:58:08,334 (deepspeed_trainer:228) INFO: 26epoch:train:7301-7400batch: iter_time=1.098e-04, loss_ctc=68.103, loss_att=48.308, acc=0.720, loss=54.232, grad_norm=4.966, loss_scale=1.000, learning_rate=7.923e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 17:58:45,137 (deepspeed_trainer:228) INFO: 26epoch:train:7401-7500batch: iter_time=1.085e-04, loss_ctc=73.956, loss_att=55.792, acc=0.701, loss=61.235, grad_norm=6.140, loss_scale=1.000, learning_rate=7.922e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 17:58:49,637 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-07 17:59:17,345 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 17:59:33,298 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 17:59:33,299 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 17:59:33,301 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 17:59:57,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:59:57,869] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:59:58,327] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:59:59,457] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:59:59,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 17:59:59,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:00,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:00,404] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:00,983] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:01,203] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:01,246] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:01,429] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:02,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:02,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:03,081] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:03,372] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:45,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:46,343] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:47,467] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:48,302] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:48,414] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:49,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:48,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:49,055] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:49,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:49,574] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:49,822] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:51,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:52,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:53,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:53,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:00:53,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:32,340] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:32,901] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:35,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:36,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:36,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:37,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:37,376] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:37,494] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:38,068] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:38,120] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:38,830] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:39,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:41,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:41,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:42,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:01:42,761] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:19,482] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:20,260] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:23,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:24,361] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:24,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:25,016] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:25,398] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:25,582] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:26,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:26,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:27,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:29,834] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:30,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:30,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:30,797] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:02:31,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 18:03:35,679 (deepspeed_trainer:228) INFO: 26epoch:train:7501-7600batch: iter_time=2.484, loss_ctc=68.981, loss_att=48.244, acc=0.721, loss=54.471, grad_norm=5.097, loss_scale=1.000, learning_rate=7.921e-05, step_time=0.421 [cnode7-012:0/16] 2024-12-07 18:04:13,272 (deepspeed_trainer:228) INFO: 26epoch:train:7601-7700batch: iter_time=1.083e-04, loss_ctc=74.270, loss_att=60.975, acc=0.705, loss=64.957, grad_norm=4.629, loss_scale=1.000, learning_rate=7.920e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-07 18:04:50,474 (deepspeed_trainer:228) INFO: 26epoch:train:7701-7800batch: iter_time=1.134e-04, loss_ctc=69.812, loss_att=52.292, acc=0.706, loss=57.556, grad_norm=5.182, loss_scale=1.000, learning_rate=7.919e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 18:05:27,743 (deepspeed_trainer:228) INFO: 26epoch:train:7801-7900batch: iter_time=1.115e-04, loss_ctc=75.233, loss_att=59.119, acc=0.702, loss=63.925, grad_norm=5.347, loss_scale=1.000, learning_rate=7.918e-05, step_time=0.372 [2024-12-07 18:06:04,931] [INFO] [logging.py:129:log_dist] [Rank 0] step=383000, skipped=0, lr=[np.float64(7.915997837769233e-05)], mom=[[0.9, 0.98]] [2024-12-07 18:06:04,932] [INFO] [timer.py:264:stop] epoch=0/micro_step=128000/global_step=128000, RunningAvgSamplesPerSec=43.797002646483534, CurrSamplesPerSec=41.83246648189532, MemAllocated=2.04GB, MaxMemAllocated=23.93GB [cnode7-012:0/16] 2024-12-07 18:06:04,934 (deepspeed_trainer:228) INFO: 26epoch:train:7901-8000batch: iter_time=1.173e-04, loss_ctc=72.491, loss_att=50.747, acc=0.716, loss=57.269, grad_norm=4.976, loss_scale=1.000, learning_rate=7.917e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 18:06:42,135 (deepspeed_trainer:228) INFO: 26epoch:train:8001-8100batch: iter_time=1.132e-04, loss_ctc=72.727, loss_att=53.850, acc=0.718, loss=59.532, grad_norm=4.711, loss_scale=1.000, learning_rate=7.915e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 18:07:19,281 (deepspeed_trainer:228) INFO: 26epoch:train:8101-8200batch: iter_time=1.130e-04, loss_ctc=72.280, loss_att=53.346, acc=0.724, loss=59.034, grad_norm=4.532, loss_scale=1.000, learning_rate=7.914e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 18:07:56,459 (deepspeed_trainer:228) INFO: 26epoch:train:8201-8300batch: iter_time=1.133e-04, loss_ctc=69.561, loss_att=55.200, acc=0.707, loss=59.494, grad_norm=4.920, loss_scale=1.000, learning_rate=7.913e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 18:08:33,628 (deepspeed_trainer:228) INFO: 26epoch:train:8301-8400batch: iter_time=1.128e-04, loss_ctc=76.697, loss_att=54.073, acc=0.721, loss=60.856, grad_norm=5.704, loss_scale=1.000, learning_rate=7.912e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 18:09:10,641 (deepspeed_trainer:228) INFO: 26epoch:train:8401-8500batch: iter_time=1.115e-04, loss_ctc=64.786, loss_att=49.360, acc=0.704, loss=53.989, grad_norm=4.629, loss_scale=1.000, learning_rate=7.911e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 18:09:47,766 (deepspeed_trainer:228) INFO: 26epoch:train:8501-8600batch: iter_time=1.121e-04, loss_ctc=73.278, loss_att=57.869, acc=0.723, loss=62.468, grad_norm=4.500, loss_scale=1.000, learning_rate=7.910e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 18:10:25,107 (deepspeed_trainer:228) INFO: 26epoch:train:8601-8700batch: iter_time=1.129e-04, loss_ctc=78.428, loss_att=66.614, acc=0.701, loss=70.136, grad_norm=5.296, loss_scale=1.000, learning_rate=7.909e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 18:11:02,022 (deepspeed_trainer:228) INFO: 26epoch:train:8701-8800batch: iter_time=1.108e-04, loss_ctc=67.583, loss_att=50.782, acc=0.710, loss=55.823, grad_norm=4.961, loss_scale=1.000, learning_rate=7.908e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 18:11:38,964 (deepspeed_trainer:228) INFO: 26epoch:train:8801-8900batch: iter_time=1.117e-04, loss_ctc=68.164, loss_att=53.081, acc=0.717, loss=57.588, grad_norm=5.221, loss_scale=1.000, learning_rate=7.907e-05, step_time=0.369 [2024-12-07 18:12:15,827] [INFO] [logging.py:129:log_dist] [Rank 0] step=384000, skipped=0, lr=[np.float64(7.905683856568462e-05)], mom=[[0.9, 0.98]] [2024-12-07 18:12:15,828] [INFO] [timer.py:264:stop] epoch=0/micro_step=129000/global_step=129000, RunningAvgSamplesPerSec=43.80050289759693, CurrSamplesPerSec=42.30978133523666, MemAllocated=2.04GB, MaxMemAllocated=23.93GB [cnode7-012:0/16] 2024-12-07 18:12:15,829 (deepspeed_trainer:228) INFO: 26epoch:train:8901-9000batch: iter_time=1.113e-04, loss_ctc=66.883, loss_att=46.564, acc=0.718, loss=52.653, grad_norm=5.036, loss_scale=1.000, learning_rate=7.906e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 18:12:52,771 (deepspeed_trainer:228) INFO: 26epoch:train:9001-9100batch: iter_time=1.150e-04, loss_ctc=69.482, loss_att=46.120, acc=0.729, loss=53.133, grad_norm=4.411, loss_scale=1.000, learning_rate=7.905e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 18:13:29,851 (deepspeed_trainer:228) INFO: 26epoch:train:9101-9200batch: iter_time=1.141e-04, loss_ctc=68.471, loss_att=55.986, acc=0.713, loss=59.728, grad_norm=4.618, loss_scale=1.000, learning_rate=7.904e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 18:14:06,627 (deepspeed_trainer:228) INFO: 26epoch:train:9201-9300batch: iter_time=1.128e-04, loss_ctc=66.623, loss_att=46.369, acc=0.715, loss=52.438, grad_norm=4.867, loss_scale=1.000, learning_rate=7.903e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 18:14:39,391 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-07 18:15:06,171 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 18:15:23,413 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 18:15:23,413 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 18:15:23,415 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 18:15:46,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:15:47,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:15:47,611] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:15:47,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:15:48,006] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:15:47,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:15:48,934] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:15:49,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:15:49,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:15:49,137] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:15:49,223] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:15:49,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:15:51,248] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:15:51,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:15:52,133] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:15:52,271] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:35,159] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:35,515] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:35,724] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:36,548] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:36,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:37,309] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:37,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:37,708] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:37,781] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:38,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:39,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:40,432] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:40,902] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:41,032] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:41,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:16:44,434] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:22,790] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:24,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:24,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:24,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:25,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:25,521] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:26,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:26,033] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:26,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:26,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:28,568] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:29,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:29,808] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:32,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:33,158] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:17:33,915] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:10,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:11,203] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:11,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:11,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:12,814] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:13,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:13,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:14,587] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:14,693] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:15,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:17,400] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:17,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:18,756] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:21,691] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:22,444] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:18:22,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 18:18:59,002 (deepspeed_trainer:228) INFO: 26epoch:train:9301-9400batch: iter_time=2.548, loss_ctc=80.624, loss_att=59.188, acc=0.702, loss=65.666, grad_norm=6.280, loss_scale=1.000, learning_rate=7.902e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 18:19:36,832 (deepspeed_trainer:228) INFO: 26epoch:train:9401-9500batch: iter_time=1.135e-04, loss_ctc=67.960, loss_att=48.793, acc=0.724, loss=54.572, grad_norm=4.408, loss_scale=1.000, learning_rate=7.901e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 18:20:14,657 (deepspeed_trainer:228) INFO: 26epoch:train:9501-9600batch: iter_time=1.146e-04, loss_ctc=71.983, loss_att=58.888, acc=0.706, loss=62.831, grad_norm=4.601, loss_scale=1.000, learning_rate=7.900e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 18:20:52,339 (deepspeed_trainer:228) INFO: 26epoch:train:9601-9700batch: iter_time=1.147e-04, loss_ctc=69.262, loss_att=52.307, acc=0.707, loss=57.408, grad_norm=4.822, loss_scale=1.000, learning_rate=7.899e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 18:21:30,011 (deepspeed_trainer:228) INFO: 26epoch:train:9701-9800batch: iter_time=1.083e-04, loss_ctc=75.874, loss_att=57.390, acc=0.702, loss=62.930, grad_norm=5.473, loss_scale=1.000, learning_rate=7.898e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-07 18:22:07,003 (deepspeed_trainer:228) INFO: 26epoch:train:9801-9900batch: iter_time=1.095e-04, loss_ctc=69.941, loss_att=50.896, acc=0.724, loss=56.630, grad_norm=4.755, loss_scale=1.000, learning_rate=7.897e-05, step_time=0.369 [2024-12-07 18:22:43,898] [INFO] [logging.py:129:log_dist] [Rank 0] step=385000, skipped=0, lr=[np.float64(7.895410085744554e-05)], mom=[[0.9, 0.98]] [2024-12-07 18:22:43,899] [INFO] [timer.py:264:stop] epoch=0/micro_step=130000/global_step=130000, RunningAvgSamplesPerSec=43.801740582485564, CurrSamplesPerSec=42.192736766389636, MemAllocated=2.04GB, MaxMemAllocated=23.93GB [cnode7-012:0/16] 2024-12-07 18:22:43,901 (deepspeed_trainer:228) INFO: 26epoch:train:9901-10000batch: iter_time=1.123e-04, loss_ctc=73.379, loss_att=54.089, acc=0.718, loss=59.878, grad_norm=4.553, loss_scale=1.000, learning_rate=7.896e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 18:23:21,314 (deepspeed_trainer:228) INFO: 26epoch:train:10001-10100batch: iter_time=1.117e-04, loss_ctc=70.413, loss_att=53.897, acc=0.726, loss=58.892, grad_norm=4.518, loss_scale=1.000, learning_rate=7.895e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 18:23:58,459 (deepspeed_trainer:228) INFO: 26epoch:train:10101-10200batch: iter_time=1.109e-04, loss_ctc=72.589, loss_att=54.388, acc=0.708, loss=59.853, grad_norm=5.715, loss_scale=1.000, learning_rate=7.894e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 18:24:35,530 (deepspeed_trainer:228) INFO: 26epoch:train:10201-10300batch: iter_time=1.116e-04, loss_ctc=69.120, loss_att=49.885, acc=0.712, loss=55.630, grad_norm=5.015, loss_scale=1.000, learning_rate=7.893e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 18:25:12,821 (deepspeed_trainer:228) INFO: 26epoch:train:10301-10400batch: iter_time=1.109e-04, loss_ctc=69.811, loss_att=54.766, acc=0.708, loss=59.263, grad_norm=4.554, loss_scale=1.000, learning_rate=7.892e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 18:25:49,925 (deepspeed_trainer:228) INFO: 26epoch:train:10401-10500batch: iter_time=1.090e-04, loss_ctc=74.173, loss_att=56.376, acc=0.724, loss=61.673, grad_norm=4.678, loss_scale=1.000, learning_rate=7.891e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 18:26:26,887 (deepspeed_trainer:228) INFO: 26epoch:train:10501-10600batch: iter_time=1.138e-04, loss_ctc=74.674, loss_att=66.628, acc=0.695, loss=69.050, grad_norm=5.311, loss_scale=1.000, learning_rate=7.890e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 18:27:03,582 (deepspeed_trainer:228) INFO: 26epoch:train:10601-10700batch: iter_time=1.110e-04, loss_ctc=69.450, loss_att=54.157, acc=0.710, loss=58.742, grad_norm=4.835, loss_scale=1.000, learning_rate=7.889e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 18:27:40,343 (deepspeed_trainer:228) INFO: 26epoch:train:10701-10800batch: iter_time=1.074e-04, loss_ctc=64.665, loss_att=46.722, acc=0.724, loss=52.127, grad_norm=4.684, loss_scale=1.000, learning_rate=7.888e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 18:28:16,978 (deepspeed_trainer:228) INFO: 26epoch:train:10801-10900batch: iter_time=1.085e-04, loss_ctc=70.594, loss_att=47.380, acc=0.716, loss=54.333, grad_norm=4.785, loss_scale=1.000, learning_rate=7.887e-05, step_time=0.366 [2024-12-07 18:28:53,564] [INFO] [logging.py:129:log_dist] [Rank 0] step=386000, skipped=0, lr=[np.float64(7.885176264699247e-05)], mom=[[0.9, 0.98]] [2024-12-07 18:28:53,564] [INFO] [timer.py:264:stop] epoch=0/micro_step=131000/global_step=131000, RunningAvgSamplesPerSec=43.80630950522815, CurrSamplesPerSec=45.90001756517352, MemAllocated=2.04GB, MaxMemAllocated=23.93GB [cnode7-012:0/16] 2024-12-07 18:28:53,566 (deepspeed_trainer:228) INFO: 26epoch:train:10901-11000batch: iter_time=1.122e-04, loss_ctc=65.445, loss_att=46.526, acc=0.733, loss=52.209, grad_norm=4.058, loss_scale=1.000, learning_rate=7.886e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-07 18:29:30,531 (deepspeed_trainer:228) INFO: 26epoch:train:11001-11100batch: iter_time=1.143e-04, loss_ctc=71.135, loss_att=54.989, acc=0.703, loss=59.803, grad_norm=5.296, loss_scale=1.000, learning_rate=7.885e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 18:30:07,695 (deepspeed_trainer:228) INFO: 26epoch:train:11101-11200batch: iter_time=1.103e-04, loss_ctc=65.475, loss_att=48.373, acc=0.720, loss=53.501, grad_norm=4.842, loss_scale=1.000, learning_rate=7.884e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 18:30:31,295 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-07 18:30:58,837 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 18:31:15,727 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 18:31:15,728 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 18:31:15,730 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 18:31:38,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:31:39,193] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:31:40,143] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:31:40,629] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:31:40,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:31:41,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:31:41,569] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:31:41,406] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:31:41,511] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:31:41,956] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:31:42,278] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:31:42,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:31:43,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:31:44,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:31:44,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:31:44,519] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:26,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:27,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:28,541] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:28,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:29,416] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:29,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:29,475] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:29,922] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:29,982] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:30,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:30,828] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:33,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:33,253] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:33,585] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:33,950] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:32:36,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:13,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:14,053] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:16,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:16,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:16,585] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:17,048] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:18,039] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:18,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:19,141] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:19,218] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:19,231] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:22,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:22,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:23,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:23,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:33:26,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:00,349] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:00,761] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:03,691] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:03,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:04,505] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:05,032] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:05,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:06,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:06,873] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:07,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:07,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:10,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:11,307] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:11,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:11,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:34:15,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 18:34:59,833 (deepspeed_trainer:228) INFO: 26epoch:train:11201-11300batch: iter_time=2.518, loss_ctc=73.995, loss_att=51.297, acc=0.713, loss=58.101, grad_norm=5.818, loss_scale=1.000, learning_rate=7.883e-05, step_time=0.403 [cnode7-012:0/16] 2024-12-07 18:35:36,728 (deepspeed_trainer:228) INFO: 26epoch:train:11301-11400batch: iter_time=1.065e-04, loss_ctc=75.475, loss_att=57.907, acc=0.714, loss=63.174, grad_norm=4.782, loss_scale=1.000, learning_rate=7.882e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 18:36:13,716 (deepspeed_trainer:228) INFO: 26epoch:train:11401-11500batch: iter_time=1.122e-04, loss_ctc=66.480, loss_att=53.926, acc=0.706, loss=57.690, grad_norm=4.570, loss_scale=1.000, learning_rate=7.881e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 18:36:50,775 (deepspeed_trainer:228) INFO: 26epoch:train:11501-11600batch: iter_time=1.102e-04, loss_ctc=77.990, loss_att=58.183, acc=0.704, loss=64.140, grad_norm=5.376, loss_scale=1.000, learning_rate=7.880e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 18:37:27,483 (deepspeed_trainer:228) INFO: 26epoch:train:11601-11700batch: iter_time=1.092e-04, loss_ctc=69.861, loss_att=50.206, acc=0.715, loss=56.094, grad_norm=5.050, loss_scale=1.000, learning_rate=7.879e-05, step_time=0.367 [cnode7-012:0/16] 2024-12-07 18:38:04,966 (deepspeed_trainer:228) INFO: 26epoch:train:11701-11800batch: iter_time=1.090e-04, loss_ctc=73.267, loss_att=53.229, acc=0.725, loss=59.230, grad_norm=4.677, loss_scale=1.000, learning_rate=7.878e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 18:38:42,014 (deepspeed_trainer:228) INFO: 26epoch:train:11801-11900batch: iter_time=1.083e-04, loss_ctc=68.327, loss_att=50.316, acc=0.723, loss=55.699, grad_norm=4.637, loss_scale=1.000, learning_rate=7.877e-05, step_time=0.370 [2024-12-07 18:39:19,254] [INFO] [logging.py:129:log_dist] [Rank 0] step=387000, skipped=0, lr=[np.float64(7.874982135192627e-05)], mom=[[0.9, 0.98]] [2024-12-07 18:39:19,255] [INFO] [timer.py:264:stop] epoch=0/micro_step=132000/global_step=132000, RunningAvgSamplesPerSec=43.80680636489806, CurrSamplesPerSec=43.8457483276431, MemAllocated=2.04GB, MaxMemAllocated=23.93GB [cnode7-012:0/16] 2024-12-07 18:39:19,256 (deepspeed_trainer:228) INFO: 26epoch:train:11901-12000batch: iter_time=1.086e-04, loss_ctc=71.776, loss_att=55.657, acc=0.712, loss=60.484, grad_norm=5.304, loss_scale=1.000, learning_rate=7.875e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 18:39:56,677 (deepspeed_trainer:228) INFO: 26epoch:train:12001-12100batch: iter_time=1.106e-04, loss_ctc=74.436, loss_att=53.724, acc=0.724, loss=59.930, grad_norm=5.360, loss_scale=1.000, learning_rate=7.874e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 18:40:33,742 (deepspeed_trainer:228) INFO: 26epoch:train:12101-12200batch: iter_time=1.100e-04, loss_ctc=65.350, loss_att=48.601, acc=0.710, loss=53.611, grad_norm=5.254, loss_scale=1.000, learning_rate=7.873e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 18:41:11,295 (deepspeed_trainer:228) INFO: 26epoch:train:12201-12300batch: iter_time=1.096e-04, loss_ctc=71.315, loss_att=56.512, acc=0.714, loss=60.936, grad_norm=4.472, loss_scale=1.000, learning_rate=7.872e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 18:41:48,677 (deepspeed_trainer:228) INFO: 26epoch:train:12301-12400batch: iter_time=1.120e-04, loss_ctc=82.650, loss_att=67.833, acc=0.709, loss=72.255, grad_norm=5.112, loss_scale=1.000, learning_rate=7.871e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 18:42:25,826 (deepspeed_trainer:228) INFO: 26epoch:train:12401-12500batch: iter_time=1.089e-04, loss_ctc=64.397, loss_att=53.665, acc=0.707, loss=56.844, grad_norm=4.940, loss_scale=1.000, learning_rate=7.870e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 18:43:03,222 (deepspeed_trainer:228) INFO: 26epoch:train:12501-12600batch: iter_time=1.088e-04, loss_ctc=71.469, loss_att=54.207, acc=0.714, loss=59.362, grad_norm=4.576, loss_scale=1.000, learning_rate=7.869e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 18:43:40,590 (deepspeed_trainer:228) INFO: 26epoch:train:12601-12700batch: iter_time=1.076e-04, loss_ctc=65.611, loss_att=47.673, acc=0.721, loss=53.049, grad_norm=4.865, loss_scale=1.000, learning_rate=7.868e-05, step_time=0.373 [cnode7-012:0/16] 2024-12-07 18:44:18,098 (deepspeed_trainer:228) INFO: 26epoch:train:12701-12800batch: iter_time=1.126e-04, loss_ctc=70.547, loss_att=44.713, acc=0.728, loss=52.471, grad_norm=4.287, loss_scale=1.000, learning_rate=7.867e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 18:44:55,428 (deepspeed_trainer:228) INFO: 26epoch:train:12801-12900batch: iter_time=1.086e-04, loss_ctc=65.034, loss_att=52.506, acc=0.720, loss=56.268, grad_norm=4.328, loss_scale=1.000, learning_rate=7.866e-05, step_time=0.373 [2024-12-07 18:45:33,765] [INFO] [logging.py:129:log_dist] [Rank 0] step=388000, skipped=0, lr=[np.float64(7.864827441315767e-05)], mom=[[0.9, 0.98]] [2024-12-07 18:45:33,766] [INFO] [timer.py:264:stop] epoch=0/micro_step=133000/global_step=133000, RunningAvgSamplesPerSec=43.80684402853434, CurrSamplesPerSec=46.59803213230148, MemAllocated=2.04GB, MaxMemAllocated=23.93GB [cnode7-012:0/16] 2024-12-07 18:45:33,767 (deepspeed_trainer:228) INFO: 26epoch:train:12901-13000batch: iter_time=1.087e-04, loss_ctc=68.719, loss_att=48.366, acc=0.715, loss=54.495, grad_norm=5.403, loss_scale=1.000, learning_rate=7.865e-05, step_time=0.383 [cnode7-012:0/16] 2024-12-07 18:46:11,021 (deepspeed_trainer:228) INFO: 26epoch:train:13001-13100batch: iter_time=1.092e-04, loss_ctc=67.841, loss_att=52.676, acc=0.712, loss=57.214, grad_norm=5.774, loss_scale=1.000, learning_rate=7.864e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-07 18:46:24,784 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-07 18:46:51,877 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 18:47:07,959 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 18:47:07,959 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 18:47:07,961 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 18:47:32,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:47:32,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:47:32,982] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:47:34,286] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:47:35,265] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:47:35,989] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:47:35,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:47:35,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:47:35,865] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:47:36,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:47:36,045] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:47:36,095] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:47:36,682] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:47:36,727] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:47:37,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:47:37,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:19,475] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:20,833] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:20,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:23,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:23,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:24,282] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:25,251] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:25,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:25,394] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:25,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:25,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:25,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:25,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:26,571] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:26,388] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:48:26,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:06,757] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:08,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:08,761] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:11,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:11,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:12,558] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:13,023] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:14,057] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:14,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:14,572] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:14,620] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:14,869] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:15,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:15,494] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:15,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:17,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:54,648] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:56,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:56,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:58,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:49:59,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:50:00,759] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:50:01,223] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:50:01,513] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:50:02,256] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:50:02,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:50:02,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:50:03,716] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:50:04,060] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:50:04,369] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:50:04,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 18:50:07,497] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 18:51:00,916 (deepspeed_trainer:228) INFO: 26epoch:train:13101-13200batch: iter_time=2.499, loss_ctc=73.271, loss_att=49.570, acc=0.717, loss=56.673, grad_norm=5.247, loss_scale=1.000, learning_rate=7.863e-05, step_time=0.407 [cnode7-012:0/16] 2024-12-07 18:51:38,365 (deepspeed_trainer:228) INFO: 26epoch:train:13201-13300batch: iter_time=1.092e-04, loss_ctc=70.341, loss_att=57.611, acc=0.714, loss=61.410, grad_norm=4.994, loss_scale=1.000, learning_rate=7.862e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 18:52:15,840 (deepspeed_trainer:228) INFO: 26epoch:train:13301-13400batch: iter_time=1.101e-04, loss_ctc=70.151, loss_att=56.282, acc=0.716, loss=60.460, grad_norm=4.973, loss_scale=1.000, learning_rate=7.861e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 18:52:53,386 (deepspeed_trainer:228) INFO: 26epoch:train:13401-13500batch: iter_time=1.053e-04, loss_ctc=78.631, loss_att=60.048, acc=0.705, loss=65.630, grad_norm=5.695, loss_scale=1.000, learning_rate=7.860e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 18:53:30,483 (deepspeed_trainer:228) INFO: 26epoch:train:13501-13600batch: iter_time=1.101e-04, loss_ctc=67.628, loss_att=48.587, acc=0.725, loss=54.308, grad_norm=5.434, loss_scale=1.000, learning_rate=7.859e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 18:54:07,919 (deepspeed_trainer:228) INFO: 26epoch:train:13601-13700batch: iter_time=1.054e-04, loss_ctc=74.422, loss_att=56.159, acc=0.722, loss=61.664, grad_norm=4.940, loss_scale=1.000, learning_rate=7.858e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 18:54:45,034 (deepspeed_trainer:228) INFO: 26epoch:train:13701-13800batch: iter_time=1.059e-04, loss_ctc=69.088, loss_att=51.459, acc=0.726, loss=56.721, grad_norm=4.931, loss_scale=1.000, learning_rate=7.857e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 18:55:22,310 (deepspeed_trainer:228) INFO: 26epoch:train:13801-13900batch: iter_time=1.076e-04, loss_ctc=69.559, loss_att=57.323, acc=0.709, loss=61.012, grad_norm=4.808, loss_scale=1.000, learning_rate=7.856e-05, step_time=0.372 [2024-12-07 18:55:59,860] [INFO] [logging.py:129:log_dist] [Rank 0] step=389000, skipped=0, lr=[np.float64(7.854711929463735e-05)], mom=[[0.9, 0.98]] [2024-12-07 18:55:59,860] [INFO] [timer.py:264:stop] epoch=0/micro_step=134000/global_step=134000, RunningAvgSamplesPerSec=43.80466375078878, CurrSamplesPerSec=44.24303653320199, MemAllocated=2.04GB, MaxMemAllocated=23.93GB [cnode7-012:0/16] 2024-12-07 18:55:59,862 (deepspeed_trainer:228) INFO: 26epoch:train:13901-14000batch: iter_time=1.090e-04, loss_ctc=75.148, loss_att=54.730, acc=0.725, loss=60.883, grad_norm=5.789, loss_scale=1.000, learning_rate=7.855e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 18:56:36,907 (deepspeed_trainer:228) INFO: 26epoch:train:14001-14100batch: iter_time=1.061e-04, loss_ctc=64.624, loss_att=49.019, acc=0.712, loss=53.722, grad_norm=5.246, loss_scale=1.000, learning_rate=7.854e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 18:57:14,390 (deepspeed_trainer:228) INFO: 26epoch:train:14101-14200batch: iter_time=1.076e-04, loss_ctc=73.277, loss_att=57.833, acc=0.722, loss=62.460, grad_norm=4.293, loss_scale=1.000, learning_rate=7.853e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 18:57:52,175 (deepspeed_trainer:228) INFO: 26epoch:train:14201-14300batch: iter_time=1.107e-04, loss_ctc=81.182, loss_att=70.161, acc=0.707, loss=73.454, grad_norm=5.053, loss_scale=1.000, learning_rate=7.852e-05, step_time=0.378 [cnode7-012:0/16] 2024-12-07 18:58:29,266 (deepspeed_trainer:228) INFO: 26epoch:train:14301-14400batch: iter_time=1.071e-04, loss_ctc=66.236, loss_att=50.882, acc=0.713, loss=55.500, grad_norm=4.838, loss_scale=1.000, learning_rate=7.851e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 18:59:06,146 (deepspeed_trainer:228) INFO: 26epoch:train:14401-14500batch: iter_time=1.101e-04, loss_ctc=69.515, loss_att=53.336, acc=0.719, loss=58.194, grad_norm=5.152, loss_scale=1.000, learning_rate=7.850e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 18:59:42,594 (deepspeed_trainer:228) INFO: 26epoch:train:14501-14600batch: iter_time=1.066e-04, loss_ctc=63.799, loss_att=47.039, acc=0.711, loss=52.074, grad_norm=4.771, loss_scale=1.000, learning_rate=7.849e-05, step_time=0.364 [cnode7-012:0/16] 2024-12-07 19:00:19,490 (deepspeed_trainer:228) INFO: 26epoch:train:14601-14700batch: iter_time=1.081e-04, loss_ctc=69.836, loss_att=46.430, acc=0.734, loss=53.464, grad_norm=4.577, loss_scale=1.000, learning_rate=7.848e-05, step_time=0.369 [cnode7-012:0/16] 2024-12-07 19:00:56,505 (deepspeed_trainer:228) INFO: 26epoch:train:14701-14800batch: iter_time=1.104e-04, loss_ctc=68.542, loss_att=56.227, acc=0.723, loss=59.950, grad_norm=4.904, loss_scale=1.000, learning_rate=7.847e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-07 19:01:33,110 (deepspeed_trainer:228) INFO: 26epoch:train:14801-14900batch: iter_time=1.062e-04, loss_ctc=67.906, loss_att=48.397, acc=0.720, loss=54.226, grad_norm=5.328, loss_scale=1.000, learning_rate=7.846e-05, step_time=0.365 [2024-12-07 19:02:09,933] [INFO] [logging.py:129:log_dist] [Rank 0] step=390000, skipped=0, lr=[np.float64(7.844635348309004e-05)], mom=[[0.9, 0.98]] [2024-12-07 19:02:09,933] [INFO] [timer.py:264:stop] epoch=0/micro_step=135000/global_step=135000, RunningAvgSamplesPerSec=43.80854444424528, CurrSamplesPerSec=47.34414283546496, MemAllocated=2.04GB, MaxMemAllocated=23.93GB [cnode7-012:0/16] 2024-12-07 19:02:09,935 (deepspeed_trainer:228) INFO: 26epoch:train:14901-15000batch: iter_time=1.037e-04, loss_ctc=73.363, loss_att=55.604, acc=0.704, loss=60.932, grad_norm=6.379, loss_scale=1.000, learning_rate=7.845e-05, step_time=0.368 [2024-12-07 19:02:26,553] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:26,821] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:27,301] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:27,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:27,486] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:27,510] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:27,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:27,592] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:27,485] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:27,620] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:27,667] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:27,728] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:27,812] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:27,929] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:27,946] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:27,948] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:41,510] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:42,223] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:42,536] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:43,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:43,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:43,944] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:44,245] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:44,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:44,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:44,629] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:45,252] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:45,458] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:45,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:45,766] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:45,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:45,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:56,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:58,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:58,564] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:58,999] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:59,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:59,729] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:02:59,740] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:00,253] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:00,472] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:00,510] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:00,568] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:00,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:01,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:01,638] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:02,255] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:02,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:11,717] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:13,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:13,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:14,090] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:14,288] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:14,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:14,970] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:15,270] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:15,743] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:15,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:16,166] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:16,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:16,822] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:16,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:17,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:18,077] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:03:30,293] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 26 is about to be saved! [2024-12-07 19:03:30,328] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt [2024-12-07 19:03:30,329] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:03:32,241] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:03:32,398] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-07 19:03:32,548] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-07 19:03:32,400] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-07 19:03:32,548] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-07 19:03:32,548] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-07 19:03:32,550] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-07 19:03:32,550] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-07 19:03:32,550] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-07 19:03:32,402] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-07 19:03:32,402] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-07 19:03:32,403] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-07 19:03:32,403] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-07 19:03:32,405] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-07 19:03:32,406] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-07 19:03:32,554] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-07 19:03:32,554] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-07 19:03:33,192] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-07 19:03:33,198] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-07 19:03:33,198] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-07 19:03:33,198] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [2024-12-07 19:03:33,201] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-07 19:03:33,202] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-07 19:03:33,202] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [2024-12-07 19:03:33,208] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-07 19:03:33,208] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-07 19:03:33,208] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [2024-12-07 19:03:33,224] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-07 19:03:33,224] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-07 19:03:33,224] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [2024-12-07 19:03:33,268] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-07 19:03:33,268] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-07 19:03:33,268] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [2024-12-07 19:03:33,274] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-07 19:03:33,274] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-07 19:03:33,274] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [2024-12-07 19:03:33,294] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-07 19:03:33,294] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [2024-12-07 19:03:33,315] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-07 19:03:33,315] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-07 19:03:33,315] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-07 19:03:34,313] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-07 19:03:34,313] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-07 19:03:34,313] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-07 19:03:34,313] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-07 19:03:34,313] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-07 19:03:34,313] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-07 19:03:34,313] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-07 19:03:34,313] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-07 19:03:34,313] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 26 is ready now! [cnode7-012:0/16] 2024-12-07 19:03:34,182 (deepspeed_trainer:158) INFO: 26epoch results: [train] iter_time=0.133, loss_ctc=71.363, loss_att=53.933, acc=0.714, loss=59.163, grad_norm=5.001, loss_scale=1.000, learning_rate=7.922e-05, step_time=0.374, time=2 hours, 6 minutes and 54.29 seconds, total_count=390026, gpu_max_cached_mem_GB=32.521, [valid] loss_ctc=4.500, cer_ctc=0.095, loss_att=9.500, acc=0.757, cer=0.468, wer=1.000, loss=8.000, time=1 minute and 10.27 seconds, total_count=26, gpu_max_cached_mem_GB=32.521 [cnode7-012:0/16] 2024-12-07 19:03:36,987 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-07 19:04:03,848 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 19:04:19,324 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 19:04:19,324 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 19:04:19,326 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 19:04:39,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:04:39,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:04:39,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:04:40,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:04:41,766] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:04:41,915] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:04:42,429] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:04:42,719] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:04:42,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:04:42,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:04:42,852] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:04:43,338] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:04:43,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:04:43,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:04:43,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:04:44,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:27,601] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:27,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:27,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:29,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:29,902] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:30,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:30,682] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:31,120] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:31,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:31,563] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:31,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:32,100] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:32,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:32,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:32,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:05:34,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:13,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:13,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:14,260] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:18,291] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:18,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:18,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:18,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:19,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:19,221] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:19,273] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:19,399] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:19,451] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:19,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:20,237] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:21,853] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:06:24,037] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:01,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:02,016] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:03,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:06,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:06,320] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:06,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:07,016] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:07,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:07,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:07,797] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:07,822] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:08,192] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:08,414] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:08,995] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:10,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:07:13,278] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 19:08:16,270 (deepspeed_trainer:228) INFO: 27epoch:train:1-100batch: iter_time=2.383, loss_ctc=69.752, loss_att=57.293, acc=0.703, loss=61.044, grad_norm=5.571, loss_scale=1.000, learning_rate=7.844e-05, step_time=0.412 [cnode7-012:0/16] 2024-12-07 19:08:53,433 (deepspeed_trainer:228) INFO: 27epoch:train:101-200batch: iter_time=1.067e-04, loss_ctc=77.757, loss_att=59.806, acc=0.701, loss=65.198, grad_norm=5.223, loss_scale=1.000, learning_rate=7.843e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 19:09:30,546 (deepspeed_trainer:228) INFO: 27epoch:train:201-300batch: iter_time=1.088e-04, loss_ctc=68.755, loss_att=50.117, acc=0.724, loss=55.719, grad_norm=4.979, loss_scale=1.000, learning_rate=7.842e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 19:10:07,717 (deepspeed_trainer:228) INFO: 27epoch:train:301-400batch: iter_time=1.094e-04, loss_ctc=72.069, loss_att=55.774, acc=0.698, loss=60.641, grad_norm=5.986, loss_scale=1.000, learning_rate=7.841e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 19:10:45,225 (deepspeed_trainer:228) INFO: 27epoch:train:401-500batch: iter_time=1.099e-04, loss_ctc=77.200, loss_att=56.347, acc=0.711, loss=62.618, grad_norm=5.256, loss_scale=1.000, learning_rate=7.840e-05, step_time=0.375 [cnode7-012:0/16] 2024-12-07 19:11:22,118 (deepspeed_trainer:228) INFO: 27epoch:train:501-600batch: iter_time=1.087e-04, loss_ctc=71.996, loss_att=49.538, acc=0.715, loss=56.287, grad_norm=4.858, loss_scale=1.000, learning_rate=7.839e-05, step_time=0.368 [cnode7-012:0/16] 2024-12-07 19:11:59,269 (deepspeed_trainer:228) INFO: 27epoch:train:601-700batch: iter_time=1.081e-04, loss_ctc=76.186, loss_att=58.975, acc=0.705, loss=64.134, grad_norm=5.957, loss_scale=1.000, learning_rate=7.838e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-07 19:12:36,385 (deepspeed_trainer:228) INFO: 27epoch:train:701-800batch: iter_time=1.123e-04, loss_ctc=76.702, loss_att=58.922, acc=0.703, loss=64.216, grad_norm=5.887, loss_scale=1.000, learning_rate=7.837e-05, step_time=0.370 srun: Job step aborted: Waiting up to 32 seconds for job step to finish. slurmstepd: error: *** STEP 6508.0 ON cnode7-012 CANCELLED AT 2024-12-07T19:13:04 ***