# Running on cnode7-012 # Started at Sat Dec 7 19:13:53 CST 2024 # SLURMD_NODENAME=cnode7-012 # SLURM_CLUSTER_NAME=slurm # SLURM_CONF=/cm/shared/apps/slurm/var/etc/slurm/slurm.conf # SLURM_CPUS_ON_NODE=224 # SLURM_CPUS_PER_TASK=8 # SLURM_EXPORT_ENV=PATH # SLURM_GET_USER_ENV=1 # SLURM_GPUS_ON_NODE=8 # SLURM_GPUS_PER_NODE=8 # SLURM_GTIDS=0 # SLURM_JOBID=6541 # SLURM_JOB_CPUS_PER_NODE='224(x2)' # SLURM_JOB_END_TIME=1765106032 # SLURM_JOB_GID=1026 # SLURM_JOB_GPUS=0,1,2,3,4,5,6,7 # SLURM_JOB_ID=6541 # SLURM_JOB_NAME=exp_owsm/s2t_train_05b_ds_raw_bpe50000/train.log # SLURM_JOB_NODELIST='cnode7-[012-013]' # SLURM_JOB_NUM_NODES=2 # SLURM_JOB_PARTITION=p2 # SLURM_JOB_QOS=normal # SLURM_JOB_START_TIME=1733570032 # SLURM_JOB_UID=1026 # SLURM_JOB_USER=williamchen # SLURM_LOCALID=0 # SLURM_MEM_PER_NODE=2048000 # SLURM_NNODES=2 # SLURM_NODEID=0 # SLURM_NODELIST='cnode7-[012-013]' # SLURM_NODE_ALIASES='(null)' # SLURM_OPEN_MODE=a # SLURM_PRIO_PROCESS=0 # SLURM_PROCID=0 # SLURM_SUBMIT_DIR=/mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1 # SLURM_SUBMIT_HOST=154-T2-P1-NVR # SLURM_TASKS_PER_NODE='28(x2)' # SLURM_TASK_PID=1064636 # SLURM_TOPOLOGY_ADDR=cnode7-012 # SLURM_TOPOLOGY_ADDR_PATTERN=node # SLURM_WORKING_CLUSTER=slurm:154-T2-P1-NVR:6817:9984:109 # srun --export=ALL python3 -m espnet2.bin.s2t_train --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp_owsm/s2t_train_05b_ds_raw_bpe50000 --config conf/train_05b_ds.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp,speech,kaldi_ark --train_shape_file exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev,text_prev,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc,text_ctc,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text,text,text --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --ngpu 8 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1/exp_owsm/s2t_train_05b_ds_raw_bpe50000/.dist_init_3ea3229e-ff5e-4a6f-a49b-fce5c474804c [2024-12-07 19:14:00,833] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:01,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/bin/python3 /mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp_owsm/s2t_train_05b_ds_raw_bpe50000 --config conf/train_05b_ds.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp,speech,kaldi_ark --train_shape_file exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev,text_prev,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc,text_ctc,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text,text,text --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --ngpu 8 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1/exp_owsm/s2t_train_05b_ds_raw_bpe50000/.dist_init_3ea3229e-ff5e-4a6f-a49b-fce5c474804c /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/bin/python3 /mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp_owsm/s2t_train_05b_ds_raw_bpe50000 --config conf/train_05b_ds.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp,speech,kaldi_ark --train_shape_file exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev,text_prev,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc,text_ctc,text --fold_length 150 --train_data_path_and_name_and_type exp_owsm/s2t_stats_raw_bpe50000/splits8/text,text,text --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --ngpu 8 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///mnt/home/williamchen/espnet/egs2/owsm_v3.1/s2t1/exp_owsm/s2t_train_05b_ds_raw_bpe50000/.dist_init_3ea3229e-ff5e-4a6f-a49b-fce5c474804c [2024-12-07 19:14:13,876] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:13,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:13,963] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:14,605] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:14,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:14,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:14,977] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:14,723] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:15,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:15,138] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:14,842] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:14,962] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:15,141] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:15,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:15,177] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:14:15,195] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [W1207 19:14:21.734505668 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1207 19:14:21.735685698 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1207 19:14:21.458597721 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1207 19:14:22.558350644 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1207 19:14:22.735011634 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1207 19:14:22.977207204 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1207 19:14:22.025801006 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1207 19:14:22.413576234 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1207 19:14:23.067438417 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1207 19:14:23.070619749 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1207 19:14:22.559117155 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1207 19:14:22.655772630 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1207 19:14:22.713142187 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1207 19:14:22.716553196 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [cnode7-012:0/16] 2024-12-07 19:14:22,993 (s2t:462) INFO: Vocabulary size: 50002 [W1207 19:14:23.740720224 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [W1207 19:14:23.787795275 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator()) [cnode7-012:0/16] 2024-12-07 19:14:26,728 (abs_task:1383) INFO: pytorch.version=2.4.0+cu121, cuda.available=True, cudnn.version=90100, cudnn.benchmark=False, cudnn.deterministic=True [cnode7-012:0/16] 2024-12-07 19:14:26,733 (abs_task:1384) INFO: Model structure: ESPnetS2TModel( (frontend): DefaultFrontend( (stft): Stft(n_fft=512, win_length=400, hop_length=160, center=True, normalized=False, onesided=True) (frontend): Frontend() (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) ) (specaug): SpecAug( (freq_mask): MaskAlongAxis(mask_width_range=[0, 27], num_mask=2, axis=freq) (time_mask): MaskAlongAxisVariableMaxWidth(mask_width_ratio_range=[0.0, 0.05], num_mask=10, axis=time) ) (normalize): GlobalMVN(stats_file=exp_owsm/s2t_stats_raw_bpe50000/train/feats_stats.npz, norm_means=True, norm_vars=True) (encoder): TransformerEncoder( (embed): Conv2dSubsampling( (conv): Sequential( (0): Conv2d(1, 1024, kernel_size=(3, 3), stride=(2, 2)) (1): ReLU() (2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2)) (3): ReLU() ) (out): Sequential( (0): Linear(in_features=19456, out_features=1024, bias=True) (1): PositionalEncoding( (dropout): Dropout(p=0.1, inplace=False) ) ) ) (encoders): MultiSequential( (0): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (1): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (2): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (3): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (4): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (5): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (6): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (7): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (8): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (9): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (10): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (11): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (12): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (13): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (14): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (15): EncoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) ) (decoder): TransformerDecoder( (embed): Sequential( (0): Embedding(50002, 1024) (1): PositionalEncoding( (dropout): Dropout(p=0.1, inplace=False) ) ) (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (output_layer): Linear(in_features=1024, out_features=50002, bias=True) (decoders): MultiSequential( (0): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (1): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (2): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (3): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (4): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (5): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (6): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (7): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (8): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (9): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (10): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (11): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (12): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (13): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (14): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (15): DecoderLayer( (self_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (src_attn): MultiHeadedAttention( (linear_q): Linear(in_features=1024, out_features=1024, bias=True) (linear_k): Linear(in_features=1024, out_features=1024, bias=True) (linear_v): Linear(in_features=1024, out_features=1024, bias=True) (linear_out): Linear(in_features=1024, out_features=1024, bias=True) (dropout): Identity() (q_norm): Identity() (k_norm): Identity() ) (feed_forward): PositionwiseFeedForward( (w_1): Linear(in_features=1024, out_features=4096, bias=True) (w_2): Linear(in_features=4096, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (activation): ReLU() ) (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) (criterion_att): LabelSmoothingLoss( (criterion): KLDivLoss() ) (ctc): CTC( (ctc_lo): Linear(in_features=1024, out_features=50002, bias=True) (ctc_loss): CTCLoss() ) ) Model summary: Class Name: ESPnetS2TModel Total Number of model parameters: 653.37 M Number of trainable parameters: 653.37 M (100.0%) Size: 2.61 GB Type: torch.float32 [cnode7-012:0/16] 2024-12-07 19:14:26,733 (abs_task:1387) INFO: Optimizer: Adadelta ( Parameter Group 0 capturable: False differentiable: False eps: 1e-06 foreach: None lr: 1.0 maximize: False rho: 0.9 weight_decay: 0 ) [cnode7-012:0/16] 2024-12-07 19:14:26,734 (abs_task:1388) INFO: Scheduler: None [cnode7-012:0/16] 2024-12-07 19:14:26,737 (abs_task:1397) INFO: Saving the configuration in exp_owsm/s2t_train_05b_ds_raw_bpe50000/config.yaml [cnode7-012:0/16] 2024-12-07 19:14:31,375 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-07 19:14:33,464] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-07 19:14:33,468] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 [2024-12-07 19:14:33,613] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-07 19:14:33,614] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-07 19:14:33,525] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-07 19:14:33,529] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 [2024-12-07 19:14:33,646] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-07 19:14:33,647] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-07 19:14:34,002] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-07 19:14:34,002] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-07 19:14:34,002] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 [2024-12-07 19:14:34,003] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-07 19:14:34,159] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-07 19:14:34,160] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-07 19:14:34,018] [INFO] [comm.py:652:init_distributed] cdb=None /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-07 19:14:34,019] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 [2024-12-07 19:14:34,135] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-07 19:14:34,136] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 [2024-12-07 19:14:34,465] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-07 19:14:34,466] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-07 19:14:34,200] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-07 19:14:34,201] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 [2024-12-07 19:14:34,203] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-07 19:14:34,204] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 [2024-12-07 19:14:34,215] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-07 19:14:34,216] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-07 19:14:34,576] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-07 19:14:34,576] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [2024-12-07 19:14:34,784] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-07 19:14:34,785] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 [cnode7-012:0/16] 2024-12-07 19:14:34,507 (abs_task:1807) INFO: [valid] dataset: ESPnetDataset( speech: {"path": "dump/raw/dev_v3/wav.scp", "type": "kaldi_ark"} text_prev: {"path": "dump/raw/dev_v3/text.prev", "type": "text"} text_ctc: {"path": "dump/raw/dev_v3/text.ctc", "type": "text"} text: {"path": "dump/raw/dev_v3/text", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 19:14:34,507 (abs_task:1808) INFO: [valid] Batch sampler: SortedBatchSampler(N-batch=74743, batch_size=16, shape_file=exp_owsm/s2t_stats_raw_bpe50000/valid/speech_shape, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 19:14:34,513 (abs_task:1809) INFO: [valid] mini-batch sizes summary: N-batch=74743, mean=16.0, min=16, max=17 /mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py:240: InstrumentationWarning: @typechecked only supports instrumenting functions wrapped with @classmethod, @staticmethod or @property -- not typechecking espnet2.train.deepspeed_trainer.DeepSpeedTrainer.valid_one_epoch def valid_one_epoch( [cnode7-012:0/16] 2024-12-07 19:14:34,567 (distributed_utils:129) WARNING: ================================================================= Found OMP_NUM_THREADS=1 in environment variables. With some advanced features, DeepSpeed may have heavy cpu workload so that OMP_NUM_THREADS=1 is not sufficient. Try to increase it in your path.sh ================================================================= [2024-12-07 19:14:34,567] [INFO] [comm.py:652:init_distributed] cdb=None [2024-12-07 19:14:34,568] [INFO] [logging.py:129:log_dist] [Rank 0] DeepSpeed info: version=0.15.3, git-hash=unknown, git-branch=unknown [2024-12-07 19:14:34,568] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 16 cnode7-012:1065350:1065350 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065350:1065350 [0] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:1065350:1065350 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:1065350:1065350 [0] NCCL INFO cudaDriverVersion 12020 NCCL version 2.20.5+cuda12.4 cnode7-013:595415:595415 [5] NCCL INFO cudaDriverVersion 12020 cnode7-013:595415:595415 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065355:1065355 [5] NCCL INFO cudaDriverVersion 12020 cnode7-012:1065355:1065355 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065355:1065355 [5] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:1065355:1065355 [5] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:1065356:1065356 [6] NCCL INFO cudaDriverVersion 12020 cnode7-012:1065356:1065356 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065356:1065356 [6] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:1065356:1065356 [6] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:1065351:1065351 [1] NCCL INFO cudaDriverVersion 12020 cnode7-012:1065351:1065351 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065351:1065351 [1] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:1065351:1065351 [1] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:595412:595412 [2] NCCL INFO cudaDriverVersion 12020 cnode7-013:595412:595412 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:595412:595412 [2] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:595412:595412 [2] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:595414:595414 [4] NCCL INFO cudaDriverVersion 12020 cnode7-013:595414:595414 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:595414:595414 [4] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:595414:595414 [4] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:595417:595417 [7] NCCL INFO cudaDriverVersion 12020 cnode7-013:595417:595417 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:595417:595417 [7] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:595417:595417 [7] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:595413:595413 [3] NCCL INFO cudaDriverVersion 12020 cnode7-013:595413:595413 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:595413:595413 [3] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:595413:595413 [3] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:1065353:1065353 [3] NCCL INFO cudaDriverVersion 12020 cnode7-012:1065353:1065353 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065353:1065353 [3] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:1065353:1065353 [3] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:1065357:1065357 [7] NCCL INFO cudaDriverVersion 12020 cnode7-012:1065357:1065357 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065357:1065357 [7] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:1065357:1065357 [7] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:1065352:1065352 [2] NCCL INFO cudaDriverVersion 12020 cnode7-012:1065352:1065352 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065352:1065352 [2] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:1065352:1065352 [2] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:595410:595410 [0] NCCL INFO cudaDriverVersion 12020 cnode7-013:595410:595410 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:595410:595410 [0] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:595410:595410 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:595416:595416 [6] NCCL INFO cudaDriverVersion 12020 cnode7-013:595416:595416 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:595416:595416 [6] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:595416:595416 [6] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:595411:595411 [1] NCCL INFO cudaDriverVersion 12020 cnode7-013:595411:595411 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:595411:595411 [1] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:595411:595411 [1] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-013:595415:595415 [5] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.175<0> cnode7-013:595415:595415 [5] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:1065354:1065354 [4] NCCL INFO cudaDriverVersion 12020 cnode7-012:1065354:1065354 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065354:1065354 [4] NCCL INFO Bootstrap : Using enp170s0np0:10.225.1.174<0> cnode7-012:1065354:1065354 [4] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation cnode7-012:1065355:1066138 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065356:1066142 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065350:1066136 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065351:1066141 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065353:1066137 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065357:1066139 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065352:1066140 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065352:1066140 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:1065352:1066140 [2] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065352:1066140 [2] NCCL INFO Using network IB cnode7-013:595416:595965 [6] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065354:1066143 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065355:1066138 [5] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:1065355:1066138 [5] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065355:1066138 [5] NCCL INFO Using network IB cnode7-012:1065356:1066142 [6] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:1065356:1066142 [6] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065356:1066142 [6] NCCL INFO Using network IB cnode7-012:1065350:1066136 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:1065350:1066136 [0] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065350:1066136 [0] NCCL INFO Using network IB cnode7-012:1065351:1066141 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:1065351:1066141 [1] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065351:1066141 [1] NCCL INFO Using network IB cnode7-013:595412:595955 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:595414:595963 [4] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:595417:595959 [7] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:595413:595956 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065353:1066137 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:1065353:1066137 [3] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065353:1066137 [3] NCCL INFO Using network IB cnode7-012:1065357:1066139 [7] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:1065357:1066139 [7] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065357:1066139 [7] NCCL INFO Using network IB cnode7-013:595410:595954 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:595411:595960 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-013:595411:595960 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:595411:595960 [1] NCCL INFO Using non-device net plugin version 0 cnode7-013:595411:595960 [1] NCCL INFO Using network IB cnode7-013:595415:595953 [5] NCCL INFO NCCL_SOCKET_IFNAME set by environment to ^lo,docker,virbr,vmnet,vboxnet cnode7-012:1065354:1066143 [4] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.174<0> cnode7-012:1065354:1066143 [4] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065354:1066143 [4] NCCL INFO Using network IB cnode7-013:595412:595955 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:595412:595955 [2] NCCL INFO Using non-device net plugin version 0 cnode7-013:595412:595955 [2] NCCL INFO Using network IB cnode7-013:595414:595963 [4] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:595414:595963 [4] NCCL INFO Using non-device net plugin version 0 cnode7-013:595414:595963 [4] NCCL INFO Using network IB cnode7-013:595417:595959 [7] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:595417:595959 [7] NCCL INFO Using non-device net plugin version 0 cnode7-013:595417:595959 [7] NCCL INFO Using network IB cnode7-013:595413:595956 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:595413:595956 [3] NCCL INFO Using non-device net plugin version 0 cnode7-013:595413:595956 [3] NCCL INFO Using network IB cnode7-013:595410:595954 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:595410:595954 [0] NCCL INFO Using non-device net plugin version 0 cnode7-013:595410:595954 [0] NCCL INFO Using network IB cnode7-013:595416:595965 [6] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:595416:595965 [6] NCCL INFO Using non-device net plugin version 0 cnode7-013:595416:595965 [6] NCCL INFO Using network IB cnode7-013:595415:595953 [5] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_3:1/IB [2]mlx5_4:1/IB [3]mlx5_5:1/IB [4]mlx5_6:1/IB [5]mlx5_7:1/RoCE [6]mlx5_8:1/IB [7]mlx5_9:1/IB [8]mlx5_10:1/IB [RO]; OOB enp170s0np0:10.225.1.175<0> cnode7-013:595415:595953 [5] NCCL INFO Using non-device net plugin version 0 cnode7-013:595415:595953 [5] NCCL INFO Using network IB cnode7-012:1065356:1066142 [6] NCCL INFO comm 0x555580e21680 rank 6 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x76fcfd9941a39c14 - Init START cnode7-012:1065350:1066136 [0] NCCL INFO comm 0x55556136e200 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x76fcfd9941a39c14 - Init START cnode7-012:1065351:1066141 [1] NCCL INFO comm 0x55557b41b070 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x76fcfd9941a39c14 - Init START cnode7-013:595412:595955 [2] NCCL INFO comm 0x55558201c600 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x76fcfd9941a39c14 - Init START cnode7-013:595414:595963 [4] NCCL INFO comm 0x555575e196c0 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x76fcfd9941a39c14 - Init START cnode7-013:595417:595959 [7] NCCL INFO comm 0x55557941dbe0 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x76fcfd9941a39c14 - Init START cnode7-012:1065353:1066137 [3] NCCL INFO comm 0x555579420960 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x76fcfd9941a39c14 - Init START cnode7-013:595413:595956 [3] NCCL INFO comm 0x55556045ac50 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x76fcfd9941a39c14 - Init START cnode7-012:1065357:1066139 [7] NCCL INFO comm 0x555567a24f50 rank 7 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x76fcfd9941a39c14 - Init START cnode7-012:1065352:1066140 [2] NCCL INFO comm 0x555561a24730 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x76fcfd9941a39c14 - Init START cnode7-013:595410:595954 [0] NCCL INFO comm 0x555563fec610 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x76fcfd9941a39c14 - Init START cnode7-013:595416:595965 [6] NCCL INFO comm 0x55556481a940 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x76fcfd9941a39c14 - Init START cnode7-013:595411:595960 [1] NCCL INFO comm 0x555569c1cd60 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x76fcfd9941a39c14 - Init START cnode7-013:595415:595953 [5] NCCL INFO comm 0x55558141ce60 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x76fcfd9941a39c14 - Init START cnode7-012:1065354:1066143 [4] NCCL INFO comm 0x55558061abb0 rank 4 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x76fcfd9941a39c14 - Init START cnode7-012:1065355:1066138 [5] NCCL INFO comm 0x555567c18740 rank 5 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x76fcfd9941a39c14 - Init START cnode7-012:1065356:1066142 [6] NCCL INFO Setting affinity for GPU 6 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:1065356:1066142 [6] NCCL INFO NVLS multicast support is available on dev 6 cnode7-012:1065353:1066137 [3] NCCL INFO Setting affinity for GPU 3 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:1065353:1066137 [3] NCCL INFO NVLS multicast support is available on dev 3 cnode7-012:1065350:1066136 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:1065350:1066136 [0] NCCL INFO NVLS multicast support is available on dev 0 cnode7-012:1065351:1066141 [1] NCCL INFO Setting affinity for GPU 1 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:1065351:1066141 [1] NCCL INFO NVLS multicast support is available on dev 1 cnode7-012:1065352:1066140 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:1065352:1066140 [2] NCCL INFO NVLS multicast support is available on dev 2 cnode7-013:595416:595965 [6] NCCL INFO Setting affinity for GPU 6 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:595416:595965 [6] NCCL INFO NVLS multicast support is available on dev 6 cnode7-013:595415:595953 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:595415:595953 [5] NCCL INFO NVLS multicast support is available on dev 5 cnode7-012:1065355:1066138 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:1065355:1066138 [5] NCCL INFO NVLS multicast support is available on dev 5 cnode7-013:595412:595955 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:595412:595955 [2] NCCL INFO NVLS multicast support is available on dev 2 cnode7-012:1065357:1066139 [7] NCCL INFO Setting affinity for GPU 7 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:1065357:1066139 [7] NCCL INFO NVLS multicast support is available on dev 7 cnode7-012:1065354:1066143 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:1065354:1066143 [4] NCCL INFO NVLS multicast support is available on dev 4 cnode7-012:1065354:1066143 [4] NCCL INFO comm 0x55558061abb0 rank 4 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0 cnode7-012:1065354:1066143 [4] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065354:1066143 [4] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065354:1066143 [4] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065354:1066143 [4] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065354:1066143 [4] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065354:1066143 [4] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065354:1066143 [4] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065354:1066143 [4] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065354:1066143 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/12/-1->4->-1 [5] -1/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->12 [13] -1/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 cnode7-012:1065354:1066143 [4] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065352:1066140 [2] NCCL INFO comm 0x555561a24730 rank 2 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0 cnode7-012:1065352:1066140 [2] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065352:1066140 [2] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065352:1066140 [2] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065352:1066140 [2] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065352:1066140 [2] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065352:1066140 [2] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065352:1066140 [2] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065352:1066140 [2] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065352:1066140 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/10/-1->2->-1 [3] -1/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->10 [11] -1/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 cnode7-012:1065352:1066140 [2] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595412:595955 [2] NCCL INFO comm 0x55558201c600 rank 10 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0 cnode7-013:595412:595955 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 [2] 11/-1/-1->10->2 [3] -1/-1/-1->10->9 [4] 11/-1/-1->10->9 [5] 11/-1/-1->10->9 [6] 11/-1/-1->10->9 [7] 11/-1/-1->10->9 [8] 11/-1/-1->10->9 [9] 11/-1/-1->10->9 [10] 11/2/-1->10->-1 [11] -1/-1/-1->10->9 [12] 11/-1/-1->10->9 [13] 11/-1/-1->10->9 [14] 11/-1/-1->10->9 [15] 11/-1/-1->10->9 cnode7-013:595412:595955 [2] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595414:595963 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:595414:595963 [4] NCCL INFO NVLS multicast support is available on dev 4 cnode7-013:595414:595963 [4] NCCL INFO comm 0x555575e196c0 rank 12 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0 cnode7-013:595414:595963 [4] NCCL INFO Trees [0] 13/-1/-1->12->11 [1] 13/-1/-1->12->11 [2] 13/-1/-1->12->11 [3] 13/-1/-1->12->11 [4] 13/-1/-1->12->4 [5] -1/-1/-1->12->11 [6] 13/-1/-1->12->11 [7] 13/-1/-1->12->11 [8] 13/-1/-1->12->11 [9] 13/-1/-1->12->11 [10] 13/-1/-1->12->11 [11] 13/-1/-1->12->11 [12] 13/4/-1->12->-1 [13] -1/-1/-1->12->11 [14] 13/-1/-1->12->11 [15] 13/-1/-1->12->11 cnode7-013:595414:595963 [4] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595411:595960 [1] NCCL INFO Setting affinity for GPU 1 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:595411:595960 [1] NCCL INFO NVLS multicast support is available on dev 1 cnode7-013:595411:595960 [1] NCCL INFO comm 0x555569c1cd60 rank 9 nRanks 16 nNodes 2 localRanks 8 localRank 1 MNNVL 0 cnode7-013:595411:595960 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] 10/-1/-1->9->1 [2] -1/-1/-1->9->8 [3] 10/-1/-1->9->8 [4] 10/-1/-1->9->8 [5] 10/-1/-1->9->8 [6] 10/-1/-1->9->8 [7] 10/-1/-1->9->8 [8] 10/-1/-1->9->8 [9] 10/1/-1->9->-1 [10] -1/-1/-1->9->8 [11] 10/-1/-1->9->8 [12] 10/-1/-1->9->8 [13] 10/-1/-1->9->8 [14] 10/-1/-1->9->8 [15] 10/-1/-1->9->8 cnode7-013:595411:595960 [1] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595416:595965 [6] NCCL INFO comm 0x55556481a940 rank 14 nRanks 16 nNodes 2 localRanks 8 localRank 6 MNNVL 0 cnode7-013:595416:595965 [6] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 15/-1/-1->14->13 [3] 15/-1/-1->14->13 [4] 15/-1/-1->14->13 [5] 15/-1/-1->14->13 [6] 15/-1/-1->14->6 [7] -1/-1/-1->14->13 [8] 15/-1/-1->14->13 [9] 15/-1/-1->14->13 [10] 15/-1/-1->14->13 [11] 15/-1/-1->14->13 [12] 15/-1/-1->14->13 [13] 15/-1/-1->14->13 [14] 15/6/-1->14->-1 [15] -1/-1/-1->14->13 cnode7-013:595416:595965 [6] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595413:595956 [3] NCCL INFO Setting affinity for GPU 3 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:595413:595956 [3] NCCL INFO NVLS multicast support is available on dev 3 cnode7-013:595413:595956 [3] NCCL INFO comm 0x55556045ac50 rank 11 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0 cnode7-013:595413:595956 [3] NCCL INFO Trees [0] 12/-1/-1->11->10 [1] 12/-1/-1->11->10 [2] 12/-1/-1->11->10 [3] 12/-1/-1->11->3 [4] -1/-1/-1->11->10 [5] 12/-1/-1->11->10 [6] 12/-1/-1->11->10 [7] 12/-1/-1->11->10 [8] 12/-1/-1->11->10 [9] 12/-1/-1->11->10 [10] 12/-1/-1->11->10 [11] 12/3/-1->11->-1 [12] -1/-1/-1->11->10 [13] 12/-1/-1->11->10 [14] 12/-1/-1->11->10 [15] 12/-1/-1->11->10 cnode7-013:595413:595956 [3] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595415:595953 [5] NCCL INFO comm 0x55558141ce60 rank 13 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0 cnode7-013:595415:595953 [5] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->12 [2] 14/-1/-1->13->12 [3] 14/-1/-1->13->12 [4] 14/-1/-1->13->12 [5] 14/-1/-1->13->5 [6] -1/-1/-1->13->12 [7] 14/-1/-1->13->12 [8] 14/-1/-1->13->12 [9] 14/-1/-1->13->12 [10] 14/-1/-1->13->12 [11] 14/-1/-1->13->12 [12] 14/-1/-1->13->12 [13] 14/5/-1->13->-1 [14] -1/-1/-1->13->12 [15] 14/-1/-1->13->12 cnode7-013:595415:595953 [5] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595417:595959 [7] NCCL INFO Setting affinity for GPU 7 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:595417:595959 [7] NCCL INFO NVLS multicast support is available on dev 7 cnode7-013:595417:595959 [7] NCCL INFO comm 0x55557941dbe0 rank 15 nRanks 16 nNodes 2 localRanks 8 localRank 7 MNNVL 0 cnode7-013:595417:595959 [7] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 8/-1/-1->15->14 [2] 8/-1/-1->15->14 [3] 8/-1/-1->15->14 [4] 8/-1/-1->15->14 [5] 8/-1/-1->15->14 [6] 8/-1/-1->15->14 [7] 8/-1/-1->15->7 [8] -1/-1/-1->15->14 [9] 8/-1/-1->15->14 [10] 8/-1/-1->15->14 [11] 8/-1/-1->15->14 [12] 8/-1/-1->15->14 [13] 8/-1/-1->15->14 [14] 8/-1/-1->15->14 [15] 8/7/-1->15->-1 cnode7-013:595417:595959 [7] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595410:595954 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:595410:595954 [0] NCCL INFO NVLS multicast support is available on dev 0 cnode7-013:595410:595954 [0] NCCL INFO comm 0x555563fec610 rank 8 nRanks 16 nNodes 2 localRanks 8 localRank 0 MNNVL 0 cnode7-013:595410:595954 [0] NCCL INFO Trees [0] 9/-1/-1->8->0 [1] -1/-1/-1->8->15 [2] 9/-1/-1->8->15 [3] 9/-1/-1->8->15 [4] 9/-1/-1->8->15 [5] 9/-1/-1->8->15 [6] 9/-1/-1->8->15 [7] 9/-1/-1->8->15 [8] 9/0/-1->8->-1 [9] -1/-1/-1->8->15 [10] 9/-1/-1->8->15 [11] 9/-1/-1->8->15 [12] 9/-1/-1->8->15 [13] 9/-1/-1->8->15 [14] 9/-1/-1->8->15 [15] 9/-1/-1->8->15 cnode7-013:595410:595954 [0] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065351:1066141 [1] NCCL INFO comm 0x55557b41b070 rank 1 nRanks 16 nNodes 2 localRanks 8 localRank 1 MNNVL 0 cnode7-012:1065351:1066141 [1] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065351:1066141 [1] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065351:1066141 [1] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065351:1066141 [1] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065351:1066141 [1] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065351:1066141 [1] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065351:1066141 [1] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065351:1066141 [1] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065351:1066141 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/9/-1->1->-1 [2] -1/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->9 [10] -1/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 cnode7-012:1065351:1066141 [1] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065357:1066139 [7] NCCL INFO comm 0x555567a24f50 rank 7 nRanks 16 nNodes 2 localRanks 8 localRank 7 MNNVL 0 cnode7-012:1065357:1066139 [7] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065357:1066139 [7] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065357:1066139 [7] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065357:1066139 [7] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065357:1066139 [7] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065357:1066139 [7] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065357:1066139 [7] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065357:1066139 [7] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065357:1066139 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] 0/-1/-1->7->6 [2] 0/-1/-1->7->6 [3] 0/-1/-1->7->6 [4] 0/-1/-1->7->6 [5] 0/-1/-1->7->6 [6] 0/-1/-1->7->6 [7] 0/15/-1->7->-1 [8] -1/-1/-1->7->6 [9] 0/-1/-1->7->6 [10] 0/-1/-1->7->6 [11] 0/-1/-1->7->6 [12] 0/-1/-1->7->6 [13] 0/-1/-1->7->6 [14] 0/-1/-1->7->6 [15] 0/-1/-1->7->15 cnode7-012:1065357:1066139 [7] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065350:1066136 [0] NCCL INFO comm 0x55556136e200 rank 0 nRanks 16 nNodes 2 localRanks 8 localRank 0 MNNVL 0 cnode7-012:1065350:1066136 [0] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065350:1066136 [0] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065350:1066136 [0] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065350:1066136 [0] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065350:1066136 [0] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065350:1066136 [0] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065350:1066136 [0] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065350:1066136 [0] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 00/16 : 0 7 6 5 4 3 2 1 9 10 11 12 13 14 15 8 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 01/16 : 0 8 15 14 13 12 11 10 9 1 2 3 4 5 6 7 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 02/16 : 0 7 6 5 4 3 11 12 13 14 15 8 9 10 2 1 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 03/16 : 0 1 2 10 9 8 15 14 13 12 11 3 4 5 6 7 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 04/16 : 0 7 6 5 13 14 15 8 9 10 11 12 4 3 2 1 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 05/16 : 0 1 2 3 4 12 11 10 9 8 15 14 13 5 6 7 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 06/16 : 0 7 15 8 9 10 11 12 13 14 6 5 4 3 2 1 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 07/16 : 0 1 2 3 4 5 6 14 13 12 11 10 9 8 15 7 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 08/16 : 0 7 6 5 4 3 2 1 9 10 11 12 13 14 15 8 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 09/16 : 0 8 15 14 13 12 11 10 9 1 2 3 4 5 6 7 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 10/16 : 0 7 6 5 4 3 11 12 13 14 15 8 9 10 2 1 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 11/16 : 0 1 2 10 9 8 15 14 13 12 11 3 4 5 6 7 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 12/16 : 0 7 6 5 13 14 15 8 9 10 11 12 4 3 2 1 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 13/16 : 0 1 2 3 4 12 11 10 9 8 15 14 13 5 6 7 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 14/16 : 0 7 15 8 9 10 11 12 13 14 6 5 4 3 2 1 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 15/16 : 0 1 2 3 4 5 6 14 13 12 11 10 9 8 15 7 cnode7-012:1065350:1066136 [0] NCCL INFO Trees [0] 1/8/-1->0->-1 [1] -1/-1/-1->0->7 [2] 1/-1/-1->0->7 [3] 1/-1/-1->0->7 [4] 1/-1/-1->0->7 [5] 1/-1/-1->0->7 [6] 1/-1/-1->0->7 [7] 1/-1/-1->0->7 [8] 1/-1/-1->0->8 [9] -1/-1/-1->0->7 [10] 1/-1/-1->0->7 [11] 1/-1/-1->0->7 [12] 1/-1/-1->0->7 [13] 1/-1/-1->0->7 [14] 1/-1/-1->0->7 [15] 1/-1/-1->0->7 cnode7-012:1065350:1066136 [0] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065353:1066137 [3] NCCL INFO comm 0x555579420960 rank 3 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0 cnode7-012:1065353:1066137 [3] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065353:1066137 [3] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065353:1066137 [3] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065353:1066137 [3] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065353:1066137 [3] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065353:1066137 [3] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065353:1066137 [3] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065353:1066137 [3] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065353:1066137 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/11/-1->3->-1 [4] -1/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->11 [12] -1/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 cnode7-012:1065353:1066137 [3] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065356:1066142 [6] NCCL INFO comm 0x555580e21680 rank 6 nRanks 16 nNodes 2 localRanks 8 localRank 6 MNNVL 0 cnode7-012:1065356:1066142 [6] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065356:1066142 [6] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065356:1066142 [6] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065356:1066142 [6] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065356:1066142 [6] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065356:1066142 [6] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065356:1066142 [6] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065356:1066142 [6] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065356:1066142 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/14/-1->6->-1 [7] -1/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->14 [15] -1/-1/-1->6->5 cnode7-012:1065356:1066142 [6] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065355:1066138 [5] NCCL INFO comm 0x555567c18740 rank 5 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0 cnode7-012:1065355:1066138 [5] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065355:1066138 [5] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065355:1066138 [5] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065355:1066138 [5] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065355:1066138 [5] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065355:1066138 [5] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065355:1066138 [5] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065355:1066138 [5] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065355:1066138 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/13/-1->5->-1 [6] -1/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->13 [14] -1/-1/-1->5->4 [15] 6/-1/-1->5->4 cnode7-012:1065355:1066138 [5] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065352:1066140 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 00/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 02/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 04/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 08/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 10/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 12/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 00/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 02/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 04/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 06/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 08/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 10/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 12/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 14/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 00/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 02/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 04/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 06/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 08/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 10/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 12/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 11/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 00/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 02/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 04/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 06/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 08/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 10/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 12/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 14/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 08/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 01/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 09/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 09/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 00/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 08/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 12/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 00/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 04/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 06/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 08/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 12/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 14/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 01/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 03/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 05/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 07/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 09/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 11/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 13/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 15/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 00/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 02/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 06/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 08/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 10/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 14/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 01/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 03/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 05/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 07/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 09/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 00/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 02/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 04/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 06/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 08/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 10/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 12/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 14/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 01/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 05/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 07/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 09/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 13/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 15/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 14/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 06/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 14/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 07/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 15/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 00/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 02/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 04/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 06/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 08/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 10/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 12/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 14/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 01/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 03/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 05/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 09/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 11/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 13/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 07/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 15/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 06/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 14/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 09/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 11/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 15/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 00/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 02/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 04/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 08/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 10/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 12/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 02/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 04/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 06/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 10/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 12/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 14/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 01/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 03/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 05/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 07/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 09/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 11/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 13/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 15/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 01/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 09/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 08/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 15/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 06/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 14/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 01/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 03/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 05/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 07/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 09/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 11/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 13/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 15/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 00/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 08/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 09/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 03/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 05/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 07/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 11/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 13/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 15/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 14/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 01/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 03/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 07/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 09/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 11/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 15/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 00/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 02/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 04/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 06/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 08/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 10/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 12/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 14/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 00/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 02/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 06/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 08/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 10/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 14/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 06/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 14/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 15/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 00/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 02/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 04/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 06/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 08/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 10/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 12/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 14/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 11/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 13/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 15/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Connected all rings cnode7-012:1065350:1066136 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Connected all rings cnode7-012:1065351:1066141 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Connected all rings cnode7-013:595412:595955 [2] NCCL INFO Channel 01/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 02/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 05/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 07/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 09/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 10/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 13/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 15/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 10/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 00/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 04/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 06/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 08/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 12/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Connected all rings cnode7-013:595414:595963 [4] NCCL INFO Channel 01/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 03/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 04/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 07/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 09/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 11/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 12/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 15/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 04/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 12/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Connected all rings cnode7-012:1065353:1066137 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 02/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 06/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 10/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 14/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 11/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 13/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1066137 [3] NCCL INFO Channel 15/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Connected all rings cnode7-013:595413:595956 [3] NCCL INFO Channel 01/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 03/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 05/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 07/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 09/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 11/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 13/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 15/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 11/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 00/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 02/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 04/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 06/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 08/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 10/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 12/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:595956 [3] NCCL INFO Channel 14/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Connected all rings cnode7-013:595417:595959 [7] NCCL INFO Channel 07/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 15/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 01/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 03/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 05/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 07/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 09/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 11/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 13/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Connected all rings cnode7-012:1065357:1066139 [7] NCCL INFO Channel 07/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 15/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 02/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 04/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 06/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 10/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 12/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 14/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 01/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 03/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 05/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 06/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 09/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 11/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 13/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1066139 [7] NCCL INFO Channel 14/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Connected all rings cnode7-012:1065352:1066140 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 10/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 13/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Connected all rings cnode7-013:595410:595954 [0] NCCL INFO Channel 00/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 03/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 05/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 07/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 08/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 11/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 13/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 15/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 02/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 04/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 06/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 10/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 12/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 14/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:595954 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 08/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Connected all rings cnode7-013:595416:595965 [6] NCCL INFO Channel 01/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 03/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 05/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 06/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 09/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 11/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 13/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 14/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 06/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 14/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 00/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 02/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 04/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 08/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 10/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:595965 [6] NCCL INFO Channel 12/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Connected all rings cnode7-013:595411:595960 [1] NCCL INFO Channel 01/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 03/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 05/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 07/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 09/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 11/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 13/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 15/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 09/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 00/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 02/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 04/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 06/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 08/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 10/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 12/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:595960 [1] NCCL INFO Channel 14/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Connected all rings cnode7-013:595415:595953 [5] NCCL INFO Channel 01/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 03/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 05/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 07/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 09/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 11/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 13/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 15/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 05/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 13/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 00/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 02/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 04/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 06/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 08/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 10/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 12/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:595953 [5] NCCL INFO Channel 14/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Connected all rings cnode7-012:1065354:1066143 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 08/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 10/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 12/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 14/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 04/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 12/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 01/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 03/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 05/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 07/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 09/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 11/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 13/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1066143 [4] NCCL INFO Channel 15/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Connected all rings cnode7-012:1065355:1066138 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 10/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 12/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 05/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 13/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 01/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 03/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 04/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 07/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 09/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 11/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 12/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1066138 [5] NCCL INFO Channel 15/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Connected all rings cnode7-012:1065356:1066142 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 04/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 08/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 10/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 12/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 14/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 06/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 14/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 01/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 03/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 05/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 07/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 09/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 11/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 13/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1066142 [6] NCCL INFO Channel 15/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 01/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 03/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 05/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 07/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 09/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 11/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 13/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 15/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1066136 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 08/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 09/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 13/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1066141 [1] NCCL INFO Channel 15/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-013:595412:595955 [2] NCCL INFO Channel 14/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 00/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 02/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 06/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 08/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 10/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:595963 [4] NCCL INFO Channel 14/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 15/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 00/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 02/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 04/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 06/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 08/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 10/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 12/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:595959 [7] NCCL INFO Channel 14/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-012:1065352:1066140 [2] NCCL INFO Connected all trees cnode7-013:595412:595955 [2] NCCL INFO Connected all trees cnode7-012:1065352:1066140 [2] NCCL INFO NVLS comm 0x555561a24730 headRank 2 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:1065353:1066137 [3] NCCL INFO Connected all trees cnode7-013:595413:595956 [3] NCCL INFO Connected all trees cnode7-012:1065354:1066143 [4] NCCL INFO Connected all trees cnode7-013:595415:595953 [5] NCCL INFO Connected all trees cnode7-013:595417:595959 [7] NCCL INFO Connected all trees cnode7-013:595410:595954 [0] NCCL INFO Connected all trees cnode7-012:1065355:1066138 [5] NCCL INFO Connected all trees cnode7-012:1065356:1066142 [6] NCCL INFO Connected all trees cnode7-012:1065350:1066136 [0] NCCL INFO Connected all trees cnode7-012:1065357:1066139 [7] NCCL INFO Connected all trees cnode7-013:595411:595960 [1] NCCL INFO Connected all trees cnode7-013:595416:595965 [6] NCCL INFO Connected all trees cnode7-013:595414:595963 [4] NCCL INFO Connected all trees cnode7-012:1065351:1066141 [1] NCCL INFO Connected all trees cnode7-012:1065350:1066136 [0] NCCL INFO NVLS comm 0x55556136e200 headRank 0 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595412:595955 [2] NCCL INFO NVLS comm 0x55558201c600 headRank 2 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595414:595963 [4] NCCL INFO NVLS comm 0x555575e196c0 headRank 4 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:1065353:1066137 [3] NCCL INFO NVLS comm 0x555579420960 headRank 3 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595413:595956 [3] NCCL INFO NVLS comm 0x55556045ac50 headRank 3 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595417:595959 [7] NCCL INFO NVLS comm 0x55557941dbe0 headRank 7 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:1065357:1066139 [7] NCCL INFO NVLS comm 0x555567a24f50 headRank 7 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595410:595954 [0] NCCL INFO NVLS comm 0x555563fec610 headRank 0 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595416:595965 [6] NCCL INFO NVLS comm 0x55556481a940 headRank 6 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595411:595960 [1] NCCL INFO NVLS comm 0x555569c1cd60 headRank 1 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595415:595953 [5] NCCL INFO NVLS comm 0x55558141ce60 headRank 5 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:1065354:1066143 [4] NCCL INFO NVLS comm 0x55558061abb0 headRank 4 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:1065355:1066138 [5] NCCL INFO NVLS comm 0x555567c18740 headRank 5 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:1065356:1066142 [6] NCCL INFO NVLS comm 0x555580e21680 headRank 6 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:1065350:1066136 [0] NCCL INFO Channel 01/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 02/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 03/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 05/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 06/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 07/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 09/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO NVLS comm 0x55557b41b070 headRank 1 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:1065351:1066141 [1] NCCL INFO Channel 00/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 02/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 03/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 04/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 06/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 07/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 08/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 00/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 04/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 07/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 08/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 09/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 00/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 01/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 02/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 03/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 06/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 07/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 08/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 09/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 00/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 01/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 02/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 04/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 05/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 06/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 08/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 00/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 01/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 02/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 03/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 04/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 05/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 06/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 08/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 00/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 01/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 04/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 05/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 06/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 08/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 09/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 12/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 13/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 14/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 15/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 00/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 01/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 02/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 04/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 05/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 06/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 08/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 09/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 10/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 12/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 13/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 14/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:595956 [3] NCCL INFO Channel 15/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 00/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 01/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 02/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 03/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 04/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 05/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 08/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 09/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 10/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 11/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 12/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 13/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 00/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 01/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 02/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 03/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 04/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 05/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 06/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 08/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 09/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 10/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 11/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 12/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 13/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:595959 [7] NCCL INFO Channel 14/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 02/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 03/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 05/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 06/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 07/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 10/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 11/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 12/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 13/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 14/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 15/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 01/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 02/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 03/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 05/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 06/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 07/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 09/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 10/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 11/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 12/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 13/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 14/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:595954 [0] NCCL INFO Channel 15/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 00/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 01/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 02/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 03/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 04/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 05/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 08/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 09/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 10/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 11/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 12/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 13/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 00/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 01/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 02/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 03/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 04/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 05/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 07/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 08/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 09/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 10/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 11/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 12/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 13/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:595965 [6] NCCL INFO Channel 15/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 02/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 03/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 04/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 06/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 07/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 10/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 11/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 12/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 13/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 14/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 15/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 00/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 02/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 03/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 04/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 06/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 07/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 08/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 10/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 11/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 12/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 13/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 14/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:595960 [1] NCCL INFO Channel 15/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 00/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 01/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 02/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 03/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 06/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 07/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 08/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 09/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 10/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 11/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 14/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 15/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 00/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 01/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 02/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 03/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 04/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 06/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 07/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 08/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 09/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 10/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 11/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 12/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 14/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:595953 [5] NCCL INFO Channel 15/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 00/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 03/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 04/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 07/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 08/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 00/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 01/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 02/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 03/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 05/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 06/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 07/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 08/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 00/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 01/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 02/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 03/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 04/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 06/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 07/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 08/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 00/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 01/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 02/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 03/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 04/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 05/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 07/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 08/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 12/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 13/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 14/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 15/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 00/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 03/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 04/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 07/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 08/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 09/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 11/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 12/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 13/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 14/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:595955 [2] NCCL INFO Channel 15/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 10/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 11/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 14/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 15/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 00/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 01/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 02/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 03/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 05/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 06/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 07/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 08/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 09/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 10/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 11/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 13/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 14/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:595963 [4] NCCL INFO Channel 15/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 09/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 11/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 12/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 13/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 14/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 15/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 00/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 04/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 07/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 08/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 09/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 12/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 13/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 14/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1066140 [2] NCCL INFO Channel 15/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 09/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 10/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 11/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 13/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 14/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 15/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 00/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 01/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 02/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 03/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 06/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 07/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 08/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 09/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 10/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 11/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 14/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1066143 [4] NCCL INFO Channel 15/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 09/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 10/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 11/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 12/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 14/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 15/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 00/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 01/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 02/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 03/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 06/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 07/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 08/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 09/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 10/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 11/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 14/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1066138 [5] NCCL INFO Channel 15/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 09/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 10/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 11/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 12/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 13/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 15/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 00/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 01/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 02/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 03/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 04/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 05/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 08/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 09/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 10/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 11/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 12/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1066142 [6] NCCL INFO Channel 13/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 10/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 11/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 12/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 13/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 14/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 15/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 02/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 03/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 05/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 06/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 07/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 10/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 11/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 12/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 13/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 14/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Channel 15/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 10/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 11/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 12/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 13/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 14/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 15/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 02/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 03/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 04/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 06/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 07/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 10/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 11/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 12/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 13/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 14/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1066141 [1] NCCL INFO Channel 15/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 09/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 10/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 12/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 13/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 14/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 15/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 00/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 01/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 04/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 05/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 06/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 08/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 09/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 12/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 13/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 14/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1066137 [3] NCCL INFO Channel 15/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 09/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 10/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 11/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 12/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 13/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 14/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 00/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 01/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 02/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 03/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 04/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 05/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 08/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 09/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 10/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 11/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 12/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1066139 [7] NCCL INFO Channel 13/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065350:1066136 [0] NCCL INFO Connected NVLS tree cnode7-012:1065350:1066136 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065350:1066136 [0] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065352:1066140 [2] NCCL INFO Connected NVLS tree cnode7-012:1065352:1066140 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065352:1066140 [2] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595410:595954 [0] NCCL INFO Connected NVLS tree cnode7-013:595410:595954 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595410:595954 [0] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595416:595965 [6] NCCL INFO Connected NVLS tree cnode7-013:595416:595965 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595416:595965 [6] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595411:595960 [1] NCCL INFO Connected NVLS tree cnode7-013:595411:595960 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595411:595960 [1] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065355:1066138 [5] NCCL INFO Connected NVLS tree cnode7-012:1065355:1066138 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065355:1066138 [5] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595413:595956 [3] NCCL INFO Connected NVLS tree cnode7-013:595413:595956 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595413:595956 [3] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065351:1066141 [1] NCCL INFO Connected NVLS tree cnode7-012:1065351:1066141 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065351:1066141 [1] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065356:1066142 [6] NCCL INFO Connected NVLS tree cnode7-012:1065356:1066142 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065356:1066142 [6] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595415:595953 [5] NCCL INFO Connected NVLS tree cnode7-013:595415:595953 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595415:595953 [5] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065353:1066137 [3] NCCL INFO Connected NVLS tree cnode7-012:1065353:1066137 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065353:1066137 [3] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065354:1066143 [4] NCCL INFO Connected NVLS tree cnode7-012:1065354:1066143 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065354:1066143 [4] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595417:595959 [7] NCCL INFO Connected NVLS tree cnode7-013:595417:595959 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595417:595959 [7] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595412:595955 [2] NCCL INFO Connected NVLS tree cnode7-013:595412:595955 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595412:595955 [2] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065357:1066139 [7] NCCL INFO Connected NVLS tree cnode7-012:1065357:1066139 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065357:1066139 [7] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065355:1066138 [5] NCCL INFO comm 0x555567c18740 rank 5 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x76fcfd9941a39c14 - Init COMPLETE cnode7-012:1065351:1066141 [1] NCCL INFO comm 0x55557b41b070 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x76fcfd9941a39c14 - Init COMPLETE cnode7-012:1065357:1066139 [7] NCCL INFO comm 0x555567a24f50 rank 7 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x76fcfd9941a39c14 - Init COMPLETE cnode7-012:1065353:1066137 [3] NCCL INFO comm 0x555579420960 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x76fcfd9941a39c14 - Init COMPLETE cnode7-013:595414:595963 [4] NCCL INFO Connected NVLS tree cnode7-013:595414:595963 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595414:595963 [4] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065356:1066142 [6] NCCL INFO comm 0x555580e21680 rank 6 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x76fcfd9941a39c14 - Init COMPLETE cnode7-012:1065354:1066143 [4] NCCL INFO comm 0x55558061abb0 rank 4 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x76fcfd9941a39c14 - Init COMPLETE cnode7-013:595417:595959 [7] NCCL INFO comm 0x55557941dbe0 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x76fcfd9941a39c14 - Init COMPLETE cnode7-012:1065350:1066136 [0] NCCL INFO comm 0x55556136e200 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x76fcfd9941a39c14 - Init COMPLETE cnode7-013:595411:595960 [1] NCCL INFO comm 0x555569c1cd60 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x76fcfd9941a39c14 - Init COMPLETE cnode7-013:595415:595953 [5] NCCL INFO comm 0x55558141ce60 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x76fcfd9941a39c14 - Init COMPLETE cnode7-013:595414:595963 [4] NCCL INFO comm 0x555575e196c0 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x76fcfd9941a39c14 - Init COMPLETE cnode7-013:595413:595956 [3] NCCL INFO comm 0x55556045ac50 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x76fcfd9941a39c14 - Init COMPLETE cnode7-013:595416:595965 [6] NCCL INFO comm 0x55556481a940 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x76fcfd9941a39c14 - Init COMPLETE cnode7-013:595410:595954 [0] NCCL INFO comm 0x555563fec610 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x76fcfd9941a39c14 - Init COMPLETE cnode7-013:595412:595955 [2] NCCL INFO comm 0x55558201c600 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x76fcfd9941a39c14 - Init COMPLETE cnode7-012:1065352:1066140 [2] NCCL INFO comm 0x555561a24730 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x76fcfd9941a39c14 - Init COMPLETE [2024-12-07 19:14:51,108] [INFO] [logging.py:129:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False [2024-12-07 19:14:51,114] [INFO] [logging.py:129:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adam as basic optimizer [2024-12-07 19:14:51,114] [INFO] [logging.py:129:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer [2024-12-07 19:14:51,172] [INFO] [logging.py:129:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam [2024-12-07 19:14:51,172] [INFO] [utils.py:59:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= [2024-12-07 19:14:51,172] [INFO] [logging.py:129:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 2 optimizer [2024-12-07 19:14:51,172] [INFO] [stage_1_and_2.py:149:__init__] Reduce bucket size 500000000 [2024-12-07 19:14:51,172] [INFO] [stage_1_and_2.py:150:__init__] Allgather bucket size 500000000 [2024-12-07 19:14:51,172] [INFO] [stage_1_and_2.py:151:__init__] CPU Offload: False [2024-12-07 19:14:51,172] [INFO] [stage_1_and_2.py:152:__init__] Round robin gradient partitioning: False [2024-12-07 19:14:53,021] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-07 19:14:52,739] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-07 19:14:53,092] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:53,092] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-07 19:14:53,093] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-07 19:14:53,099] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-07 19:14:53,107] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-07 19:14:52,829] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-07 19:14:53,140] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-07 19:14:52,842] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-07 19:14:52,908] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:52,909] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-07 19:14:53,487] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-07 19:14:53,198] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states [2024-12-07 19:14:53,199] [INFO] [utils.py:782:see_memory_usage] MA 1.37 GB Max_MA 1.45 GB CA 1.45 GB Max_CA 1 GB [2024-12-07 19:14:53,199] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 136.93 GB, percent = 6.8% [2024-12-07 19:14:53,320] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-07 19:14:53,428] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states [2024-12-07 19:14:53,429] [INFO] [utils.py:782:see_memory_usage] MA 1.37 GB Max_MA 1.52 GB CA 1.6 GB Max_CA 2 GB [2024-12-07 19:14:53,429] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 136.83 GB, percent = 6.8% [2024-12-07 19:14:53,429] [INFO] [stage_1_and_2.py:544:__init__] optimizer state initialized [2024-12-07 19:14:53,598] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-07 19:14:53,655] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer [2024-12-07 19:14:53,655] [INFO] [utils.py:782:see_memory_usage] MA 1.37 GB Max_MA 1.37 GB CA 1.6 GB Max_CA 2 GB [2024-12-07 19:14:53,655] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 136.7 GB, percent = 6.8% [2024-12-07 19:14:53,657] [INFO] [logging.py:129:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer [2024-12-07 19:14:53,657] [INFO] [logging.py:129:log_dist] [Rank 0] DeepSpeed using client callable to create LR scheduler [2024-12-07 19:14:53,657] [INFO] [logging.py:129:log_dist] [Rank 0] DeepSpeed LR Scheduler = PiecewiseLinearWarmupLR(warmup_steps_list=[0, 30000, 60000], warmup_lr_list=[0.0, 5e-05, 0.0002]) [2024-12-07 19:14:53,657] [INFO] [logging.py:129:log_dist] [Rank 0] step=0, skipped=0, lr=[np.float64(1.6666666666666667e-09)], mom=[[0.9, 0.98]] [2024-12-07 19:14:53,658] [INFO] [config.py:999:print] DeepSpeedEngine configuration: [2024-12-07 19:14:53,658] [INFO] [config.py:1003:print] activation_checkpointing_config { "partition_activations": false, "contiguous_memory_optimization": false, "cpu_checkpointing": false, "number_checkpoints": null, "synchronize_checkpoint_boundary": false, "profile": false } [2024-12-07 19:14:53,658] [INFO] [config.py:1003:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} [2024-12-07 19:14:53,658] [INFO] [config.py:1003:print] amp_enabled .................. False [2024-12-07 19:14:53,658] [INFO] [config.py:1003:print] amp_params ................... False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] autotuning_config ............ { "enabled": false, "start_step": null, "end_step": null, "metric_path": null, "arg_mappings": null, "metric": "throughput", "model_info": null, "results_dir": "autotuning_results", "exps_dir": "autotuning_exps", "overwrite": true, "fast": true, "start_profile_step": 3, "end_profile_step": 5, "tuner_type": "gridsearch", "tuner_early_stopping": 5, "tuner_num_trials": 50, "model_info_path": null, "mp_size": 1, "max_train_batch_size": null, "min_train_batch_size": 1, "max_train_micro_batch_size_per_gpu": 1.024000e+03, "min_train_micro_batch_size_per_gpu": 1, "num_tuning_micro_batch_sizes": 3 } [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] bfloat16_enabled ............. True [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] bfloat16_immediate_grad_update False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] checkpoint_parallel_write_pipeline False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] checkpoint_tag_validation_enabled True [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] checkpoint_tag_validation_fail False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] comms_config ................. [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] communication_data_type ...... None [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] curriculum_enabled_legacy .... False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] curriculum_params_legacy ..... False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] data_efficiency_enabled ...... False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] dataloader_drop_last ......... False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] disable_allgather ............ False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] dump_state ................... False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] dynamic_loss_scale_args ...... None [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] eigenvalue_enabled ........... False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] eigenvalue_gas_boundary_resolution 1 [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] eigenvalue_layer_name ........ bert.encoder.layer [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] eigenvalue_layer_num ......... 0 [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] eigenvalue_max_iter .......... 100 [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] eigenvalue_stability ......... 1e-06 [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] eigenvalue_tol ............... 0.01 [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] eigenvalue_verbose ........... False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] elasticity_enabled ........... False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] flops_profiler_config ........ { "enabled": false, "recompute_fwd_factor": 0.0, "profile_step": 1, "module_depth": -1, "top_modules": 1, "detailed": true, "output_file": null } [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] fp16_auto_cast ............... None [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] fp16_enabled ................. False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] fp16_master_weights_and_gradients False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] global_rank .................. 0 [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] grad_accum_dtype ............. None [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] gradient_accumulation_steps .. 1 [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] gradient_clipping ............ 5.0 [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] gradient_predivide_factor .... 1.0 [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] graph_harvesting ............. False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] initial_dynamic_scale ........ 1 [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] load_universal_checkpoint .... False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] loss_scale ................... 1.0 [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] memory_breakdown ............. False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] mics_hierarchial_params_gather False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] mics_shard_size .............. -1 [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] nebula_config ................ { "enabled": false, "persistent_storage_path": null, "persistent_time_interval": 100, "num_of_version_in_retention": 2, "enable_nebula_load": true, "load_path": null } [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] optimizer_legacy_fusion ...... False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] optimizer_name ............... adam [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] optimizer_params ............. {'lr': 0.0002, 'betas': [0.9, 0.98], 'eps': 1e-06, 'weight_decay': 0.0} [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] pld_enabled .................. False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] pld_params ................... False [2024-12-07 19:14:53,660] [INFO] [config.py:1003:print] prescale_gradients ........... False [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] scheduler_name ............... None [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] scheduler_params ............. None [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] seq_parallel_communication_data_type torch.float32 [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] sparse_attention ............. None [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] sparse_gradients_enabled ..... False [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] steps_per_print .............. 1000 [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] timers_config ................ enabled=True synchronized=True [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] train_batch_size ............. 16 [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] train_micro_batch_size_per_gpu 1 [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] use_data_before_expert_parallel_ False [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] use_node_local_storage ....... False [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] wall_clock_breakdown ......... False [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] weight_quantization_config ... None [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] world_size ................... 16 [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] zero_allow_untested_optimizer False [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] zero_enabled ................. True [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] zero_force_ds_cpu_optimizer .. True [2024-12-07 19:14:53,661] [INFO] [config.py:1003:print] zero_optimization_stage ...... 2 [2024-12-07 19:14:53,661] [INFO] [config.py:989:print_user_config] json = { "train_micro_batch_size_per_gpu": 1, "gradient_accumulation_steps": 1, "gradient_clipping": 5.0, "bf16": { "enabled": true }, "zero_optimization": { "stage": 2, "contiguous_gradients": true, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 5.000000e+08, "allgather_bucket_size": 5.000000e+08 }, "optimizer": { "type": "Adam", "params": { "lr": 0.0002, "betas": [0.9, 0.98], "eps": 1e-06, "weight_decay": 0.0 } }, "wall_clock_breakdown": false, "steps_per_print": 1000 } [cnode7-012:0/16] 2024-12-07 19:14:53,662 (deepspeed_trainer:75) INFO: Resume training from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26 [2024-12-07 19:14:53,664] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... /mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py:28: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. partition = torch.load(path, map_location=map_location) [2024-12-07 19:14:54,493] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,493] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,493] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,493] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,493] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,493] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,493] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,493] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,226] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,226] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,226] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,226] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,226] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,226] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,562] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:54,562] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:54,583] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:54,585] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:54,585] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:54,586] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:54,587] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:54,587] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:54,297] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:54,307] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:54,307] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:54,308] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:54,309] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:54,310] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:54,342] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,406] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,413] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:54,470] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt... [2024-12-07 19:14:55,217] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:55,217] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:55,217] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:55,217] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:55,218] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:55,219] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:55,227] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:55,229] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,952] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,952] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,954] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,954] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,955] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:54,989] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:55,141] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:55,473] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,473] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,171] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/mp_rank_00_model_states.pt. [2024-12-07 19:14:55,494] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,494] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,494] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,494] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,498] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,498] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,219] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,219] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,227] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,276] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,276] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,277] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,678] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-07 19:14:55,678] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 13 [2024-12-07 19:14:55,687] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-07 19:14:55,687] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 12 [2024-12-07 19:14:55,387] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,713] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-07 19:14:55,713] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 14 [2024-12-07 19:14:55,417] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-07 19:14:55,739] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-07 19:14:55,739] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 11 [2024-12-07 19:14:55,740] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-07 19:14:55,740] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 9 [2024-12-07 19:14:55,743] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 13 [2024-12-07 19:14:55,755] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 12 [2024-12-07 19:14:55,760] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-07 19:14:55,760] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 10 [2024-12-07 19:14:55,760] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-07 19:14:55,760] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 15 [2024-12-07 19:14:55,768] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 14 [2024-12-07 19:14:55,772] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-07 19:14:55,773] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 8 [2024-12-07 19:14:55,802] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 11 [2024-12-07 19:14:55,807] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 9 [2024-12-07 19:14:55,821] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 10 [2024-12-07 19:14:55,834] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 8 [2024-12-07 19:14:55,914] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 15 [2024-12-07 19:14:55,951] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-07 19:14:55,952] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 4 [2024-12-07 19:14:56,011] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 4 [2024-12-07 19:14:56,023] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-07 19:14:56,023] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 5 [2024-12-07 19:14:56,030] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-07 19:14:56,031] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 7 [2024-12-07 19:14:56,041] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-07 19:14:56,041] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 0 [2024-12-07 19:14:56,061] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-07 19:14:56,061] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 1 [2024-12-07 19:14:56,066] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-07 19:14:56,066] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 2 [2024-12-07 19:14:56,099] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 7 [2024-12-07 19:14:56,101] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 5 [2024-12-07 19:14:56,125] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 0 [2024-12-07 19:14:56,131] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 1 [2024-12-07 19:14:56,132] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-07 19:14:56,132] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 3 [2024-12-07 19:14:56,133] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 2 [2024-12-07 19:14:56,134] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_26/26/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-07 19:14:56,134] [INFO] [engine.py:3076:_get_all_zero_checkpoint_state_dicts] successfully read 16 ZeRO state_dicts for rank 6 [cnode7-012:0/16] 2024-12-07 19:14:56,140 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [2024-12-07 19:14:56,187] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 3 [2024-12-07 19:14:56,189] [INFO] [engine.py:3026:_load_zero_checkpoint] loading 16 zero partition checkpoints for rank 6 [cnode7-012:0/16] 2024-12-07 19:15:22,949 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 19:15:40,292 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 19:15:40,292 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 19:15:40,294 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 19:16:00,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:01,464] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:02,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:02,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:02,262] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:02,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:02,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:02,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:03,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:03,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:05,142] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:05,402] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:05,951] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:06,402] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:06,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:06,829] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:48,645] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:49,272] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:49,512] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:49,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:49,985] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:50,044] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:50,174] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:50,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:50,760] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:51,147] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:53,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:54,622] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:54,835] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:55,235] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:16:56,689] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:14,190] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:36,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:37,542] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:37,712] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:38,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:38,363] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:38,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:38,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:40,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:43,224] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:43,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:43,528] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:44,060] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:45,119] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:45,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:17:46,060] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:02,940] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:23,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:23,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:25,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:25,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:26,374] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:27,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:29,857] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:33,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:33,462] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:33,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:34,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:34,634] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:35,066] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:35,453] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:18:37,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): [2024-12-07 19:18:49,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): cnode7-012:1065357:1073392 [7] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065357:1073392 [7] NCCL INFO Using network IB cnode7-013:595412:604689 [2] NCCL INFO Using non-device net plugin version 0 cnode7-013:595412:604689 [2] NCCL INFO Using network IB cnode7-013:595414:604684 [4] NCCL INFO Using non-device net plugin version 0 cnode7-013:595414:604684 [4] NCCL INFO Using network IB cnode7-013:595413:604685 [3] NCCL INFO Using non-device net plugin version 0 cnode7-013:595413:604685 [3] NCCL INFO Using network IB cnode7-013:595417:604688 [7] NCCL INFO Using non-device net plugin version 0 cnode7-013:595417:604688 [7] NCCL INFO Using network IB cnode7-013:595416:604687 [6] NCCL INFO Using non-device net plugin version 0 cnode7-013:595416:604687 [6] NCCL INFO Using network IB cnode7-013:595411:604683 [1] NCCL INFO Using non-device net plugin version 0 cnode7-013:595411:604683 [1] NCCL INFO Using network IB cnode7-013:595410:604686 [0] NCCL INFO Using non-device net plugin version 0 cnode7-013:595410:604686 [0] NCCL INFO Using network IB cnode7-012:1065350:1073390 [0] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065350:1073390 [0] NCCL INFO Using network IB cnode7-012:1065355:1073463 [5] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065355:1073463 [5] NCCL INFO Using network IB /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): cnode7-012:1065351:1073686 [1] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065351:1073686 [1] NCCL INFO Using network IB /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): cnode7-012:1065352:1073766 [2] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065352:1073766 [2] NCCL INFO Using network IB cnode7-012:1065353:1073779 [3] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065353:1073779 [3] NCCL INFO Using network IB /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): cnode7-013:595415:604772 [5] NCCL INFO Using non-device net plugin version 0 cnode7-013:595415:604772 [5] NCCL INFO Using network IB cnode7-012:1065354:1073901 [4] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065354:1073901 [4] NCCL INFO Using network IB /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): cnode7-012:1065354:1073901 [4] NCCL INFO comm 0x5555c11bcf20 rank 4 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x4f9bc01e40954c2b - Init START cnode7-013:595416:604687 [6] NCCL INFO comm 0x5555d51cf510 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x4f9bc01e40954c2b - Init START cnode7-013:595411:604683 [1] NCCL INFO comm 0x5555c11d01f0 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x4f9bc01e40954c2b - Init START cnode7-012:1065355:1073463 [5] NCCL INFO comm 0x5555d51e2c30 rank 5 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x4f9bc01e40954c2b - Init START cnode7-013:595415:604772 [5] NCCL INFO comm 0x5555c11d89d0 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x4f9bc01e40954c2b - Init START cnode7-013:595410:604686 [0] NCCL INFO comm 0x5555b71d18d0 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x4f9bc01e40954c2b - Init START cnode7-012:1065356:1074361 [6] NCCL INFO Using non-device net plugin version 0 cnode7-012:1065356:1074361 [6] NCCL INFO Using network IB cnode7-012:1065356:1074361 [6] NCCL INFO comm 0x5555c119a770 rank 6 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x4f9bc01e40954c2b - Init START cnode7-012:1065350:1073390 [0] NCCL INFO comm 0x5555b71c1940 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x4f9bc01e40954c2b - Init START cnode7-012:1065351:1073686 [1] NCCL INFO comm 0x5555c11cc430 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x4f9bc01e40954c2b - Init START cnode7-012:1065353:1073779 [3] NCCL INFO comm 0x5555c11eafa0 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x4f9bc01e40954c2b - Init START cnode7-012:1065357:1073392 [7] NCCL INFO comm 0x5555c11be5e0 rank 7 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x4f9bc01e40954c2b - Init START cnode7-013:595412:604689 [2] NCCL INFO comm 0x5555c11d2200 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x4f9bc01e40954c2b - Init START cnode7-013:595414:604684 [4] NCCL INFO comm 0x5555c11d8790 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x4f9bc01e40954c2b - Init START cnode7-013:595413:604685 [3] NCCL INFO comm 0x5555bea42f00 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x4f9bc01e40954c2b - Init START cnode7-013:595417:604688 [7] NCCL INFO comm 0x5555c11a5720 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x4f9bc01e40954c2b - Init START cnode7-012:1065352:1073766 [2] NCCL INFO comm 0x5555c11d81c0 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x4f9bc01e40954c2b - Init START cnode7-012:1065350:1073390 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:1065350:1073390 [0] NCCL INFO NVLS multicast support is available on dev 0 cnode7-012:1065352:1073766 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:1065352:1073766 [2] NCCL INFO NVLS multicast support is available on dev 2 cnode7-012:1065355:1073463 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:1065355:1073463 [5] NCCL INFO NVLS multicast support is available on dev 5 cnode7-012:1065351:1073686 [1] NCCL INFO Setting affinity for GPU 1 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:1065351:1073686 [1] NCCL INFO NVLS multicast support is available on dev 1 cnode7-012:1065353:1073779 [3] NCCL INFO Setting affinity for GPU 3 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-012:1065353:1073779 [3] NCCL INFO NVLS multicast support is available on dev 3 cnode7-012:1065357:1073392 [7] NCCL INFO Setting affinity for GPU 7 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:1065357:1073392 [7] NCCL INFO NVLS multicast support is available on dev 7 cnode7-012:1065354:1073901 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:1065354:1073901 [4] NCCL INFO NVLS multicast support is available on dev 4 cnode7-012:1065356:1074361 [6] NCCL INFO Setting affinity for GPU 6 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-012:1065356:1074361 [6] NCCL INFO NVLS multicast support is available on dev 6 cnode7-013:595411:604683 [1] NCCL INFO Setting affinity for GPU 1 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:595411:604683 [1] NCCL INFO NVLS multicast support is available on dev 1 cnode7-013:595417:604688 [7] NCCL INFO Setting affinity for GPU 7 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:595417:604688 [7] NCCL INFO NVLS multicast support is available on dev 7 cnode7-013:595416:604687 [6] NCCL INFO Setting affinity for GPU 6 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:595416:604687 [6] NCCL INFO NVLS multicast support is available on dev 6 cnode7-013:595415:604772 [5] NCCL INFO Setting affinity for GPU 5 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:595415:604772 [5] NCCL INFO NVLS multicast support is available on dev 5 cnode7-013:595410:604686 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:595410:604686 [0] NCCL INFO NVLS multicast support is available on dev 0 cnode7-012:1065357:1073392 [7] NCCL INFO comm 0x5555c11be5e0 rank 7 nRanks 16 nNodes 2 localRanks 8 localRank 7 MNNVL 0 cnode7-012:1065357:1073392 [7] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065357:1073392 [7] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065357:1073392 [7] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065357:1073392 [7] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065357:1073392 [7] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065357:1073392 [7] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065357:1073392 [7] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065357:1073392 [7] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065357:1073392 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] 0/-1/-1->7->6 [2] 0/-1/-1->7->6 [3] 0/-1/-1->7->6 [4] 0/-1/-1->7->6 [5] 0/-1/-1->7->6 [6] 0/-1/-1->7->6 [7] 0/15/-1->7->-1 [8] -1/-1/-1->7->6 [9] 0/-1/-1->7->6 [10] 0/-1/-1->7->6 [11] 0/-1/-1->7->6 [12] 0/-1/-1->7->6 [13] 0/-1/-1->7->6 [14] 0/-1/-1->7->6 [15] 0/-1/-1->7->15 cnode7-012:1065357:1073392 [7] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595413:604685 [3] NCCL INFO Setting affinity for GPU 3 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:595413:604685 [3] NCCL INFO NVLS multicast support is available on dev 3 cnode7-013:595413:604685 [3] NCCL INFO comm 0x5555bea42f00 rank 11 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0 cnode7-013:595413:604685 [3] NCCL INFO Trees [0] 12/-1/-1->11->10 [1] 12/-1/-1->11->10 [2] 12/-1/-1->11->10 [3] 12/-1/-1->11->3 [4] -1/-1/-1->11->10 [5] 12/-1/-1->11->10 [6] 12/-1/-1->11->10 [7] 12/-1/-1->11->10 [8] 12/-1/-1->11->10 [9] 12/-1/-1->11->10 [10] 12/-1/-1->11->10 [11] 12/3/-1->11->-1 [12] -1/-1/-1->11->10 [13] 12/-1/-1->11->10 [14] 12/-1/-1->11->10 [15] 12/-1/-1->11->10 cnode7-013:595413:604685 [3] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595416:604687 [6] NCCL INFO comm 0x5555d51cf510 rank 14 nRanks 16 nNodes 2 localRanks 8 localRank 6 MNNVL 0 cnode7-013:595416:604687 [6] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 15/-1/-1->14->13 [3] 15/-1/-1->14->13 [4] 15/-1/-1->14->13 [5] 15/-1/-1->14->13 [6] 15/-1/-1->14->6 [7] -1/-1/-1->14->13 [8] 15/-1/-1->14->13 [9] 15/-1/-1->14->13 [10] 15/-1/-1->14->13 [11] 15/-1/-1->14->13 [12] 15/-1/-1->14->13 [13] 15/-1/-1->14->13 [14] 15/6/-1->14->-1 [15] -1/-1/-1->14->13 cnode7-013:595416:604687 [6] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065352:1073766 [2] NCCL INFO comm 0x5555c11d81c0 rank 2 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0 cnode7-012:1065352:1073766 [2] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065352:1073766 [2] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065352:1073766 [2] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065352:1073766 [2] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065352:1073766 [2] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065352:1073766 [2] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065352:1073766 [2] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065352:1073766 [2] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065352:1073766 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/10/-1->2->-1 [3] -1/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->10 [11] -1/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 cnode7-012:1065352:1073766 [2] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065354:1073901 [4] NCCL INFO comm 0x5555c11bcf20 rank 4 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0 cnode7-012:1065354:1073901 [4] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065354:1073901 [4] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065354:1073901 [4] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065354:1073901 [4] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065354:1073901 [4] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065354:1073901 [4] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065354:1073901 [4] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065354:1073901 [4] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065354:1073901 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/12/-1->4->-1 [5] -1/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->12 [13] -1/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 cnode7-012:1065354:1073901 [4] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065351:1073686 [1] NCCL INFO comm 0x5555c11cc430 rank 1 nRanks 16 nNodes 2 localRanks 8 localRank 1 MNNVL 0 cnode7-012:1065351:1073686 [1] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065351:1073686 [1] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065351:1073686 [1] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065351:1073686 [1] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065351:1073686 [1] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065351:1073686 [1] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065351:1073686 [1] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065351:1073686 [1] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065351:1073686 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/9/-1->1->-1 [2] -1/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->9 [10] -1/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 cnode7-012:1065351:1073686 [1] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595412:604689 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffffffff,ffff0000,00000000,00ffffff,ffffffff cnode7-013:595412:604689 [2] NCCL INFO NVLS multicast support is available on dev 2 cnode7-013:595412:604689 [2] NCCL INFO comm 0x5555c11d2200 rank 10 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0 cnode7-013:595412:604689 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 [2] 11/-1/-1->10->2 [3] -1/-1/-1->10->9 [4] 11/-1/-1->10->9 [5] 11/-1/-1->10->9 [6] 11/-1/-1->10->9 [7] 11/-1/-1->10->9 [8] 11/-1/-1->10->9 [9] 11/-1/-1->10->9 [10] 11/2/-1->10->-1 [11] -1/-1/-1->10->9 [12] 11/-1/-1->10->9 [13] 11/-1/-1->10->9 [14] 11/-1/-1->10->9 [15] 11/-1/-1->10->9 cnode7-013:595412:604689 [2] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595417:604688 [7] NCCL INFO comm 0x5555c11a5720 rank 15 nRanks 16 nNodes 2 localRanks 8 localRank 7 MNNVL 0 cnode7-013:595417:604688 [7] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 8/-1/-1->15->14 [2] 8/-1/-1->15->14 [3] 8/-1/-1->15->14 [4] 8/-1/-1->15->14 [5] 8/-1/-1->15->14 [6] 8/-1/-1->15->14 [7] 8/-1/-1->15->7 [8] -1/-1/-1->15->14 [9] 8/-1/-1->15->14 [10] 8/-1/-1->15->14 [11] 8/-1/-1->15->14 [12] 8/-1/-1->15->14 [13] 8/-1/-1->15->14 [14] 8/-1/-1->15->14 [15] 8/7/-1->15->-1 cnode7-013:595417:604688 [7] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065356:1074361 [6] NCCL INFO comm 0x5555c119a770 rank 6 nRanks 16 nNodes 2 localRanks 8 localRank 6 MNNVL 0 cnode7-012:1065356:1074361 [6] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065356:1074361 [6] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065356:1074361 [6] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065356:1074361 [6] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065356:1074361 [6] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065356:1074361 [6] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065356:1074361 [6] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065356:1074361 [6] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065356:1074361 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/14/-1->6->-1 [7] -1/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->14 [15] -1/-1/-1->6->5 cnode7-012:1065356:1074361 [6] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595414:604684 [4] NCCL INFO Setting affinity for GPU 4 to ffffffff,ffffff00,00000000,0000ffff,ffffffff,ff000000,00000000 cnode7-013:595414:604684 [4] NCCL INFO NVLS multicast support is available on dev 4 cnode7-013:595414:604684 [4] NCCL INFO comm 0x5555c11d8790 rank 12 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0 cnode7-013:595414:604684 [4] NCCL INFO Trees [0] 13/-1/-1->12->11 [1] 13/-1/-1->12->11 [2] 13/-1/-1->12->11 [3] 13/-1/-1->12->11 [4] 13/-1/-1->12->4 [5] -1/-1/-1->12->11 [6] 13/-1/-1->12->11 [7] 13/-1/-1->12->11 [8] 13/-1/-1->12->11 [9] 13/-1/-1->12->11 [10] 13/-1/-1->12->11 [11] 13/-1/-1->12->11 [12] 13/4/-1->12->-1 [13] -1/-1/-1->12->11 [14] 13/-1/-1->12->11 [15] 13/-1/-1->12->11 cnode7-013:595414:604684 [4] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065353:1073779 [3] NCCL INFO comm 0x5555c11eafa0 rank 3 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0 cnode7-012:1065353:1073779 [3] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065353:1073779 [3] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065353:1073779 [3] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065353:1073779 [3] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065353:1073779 [3] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065353:1073779 [3] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065353:1073779 [3] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065353:1073779 [3] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065353:1073779 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/11/-1->3->-1 [4] -1/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->11 [12] -1/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 cnode7-012:1065353:1073779 [3] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595415:604772 [5] NCCL INFO comm 0x5555c11d89d0 rank 13 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0 cnode7-013:595415:604772 [5] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->12 [2] 14/-1/-1->13->12 [3] 14/-1/-1->13->12 [4] 14/-1/-1->13->12 [5] 14/-1/-1->13->5 [6] -1/-1/-1->13->12 [7] 14/-1/-1->13->12 [8] 14/-1/-1->13->12 [9] 14/-1/-1->13->12 [10] 14/-1/-1->13->12 [11] 14/-1/-1->13->12 [12] 14/-1/-1->13->12 [13] 14/5/-1->13->-1 [14] -1/-1/-1->13->12 [15] 14/-1/-1->13->12 cnode7-013:595415:604772 [5] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065355:1073463 [5] NCCL INFO comm 0x5555d51e2c30 rank 5 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0 cnode7-012:1065355:1073463 [5] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065355:1073463 [5] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065355:1073463 [5] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065355:1073463 [5] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065355:1073463 [5] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065355:1073463 [5] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065355:1073463 [5] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065355:1073463 [5] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065355:1073463 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/13/-1->5->-1 [6] -1/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->13 [14] -1/-1/-1->5->4 [15] 6/-1/-1->5->4 cnode7-012:1065355:1073463 [5] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065350:1073390 [0] NCCL INFO comm 0x5555b71c1940 rank 0 nRanks 16 nNodes 2 localRanks 8 localRank 0 MNNVL 0 cnode7-012:1065350:1073390 [0] NCCL INFO NVLS Head 0: 0 8 cnode7-012:1065350:1073390 [0] NCCL INFO NVLS Head 1: 1 9 cnode7-012:1065350:1073390 [0] NCCL INFO NVLS Head 2: 2 10 cnode7-012:1065350:1073390 [0] NCCL INFO NVLS Head 3: 3 11 cnode7-012:1065350:1073390 [0] NCCL INFO NVLS Head 4: 4 12 cnode7-012:1065350:1073390 [0] NCCL INFO NVLS Head 5: 5 13 cnode7-012:1065350:1073390 [0] NCCL INFO NVLS Head 6: 6 14 cnode7-012:1065350:1073390 [0] NCCL INFO NVLS Head 7: 7 15 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 00/16 : 0 7 6 5 4 3 2 1 9 10 11 12 13 14 15 8 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 01/16 : 0 8 15 14 13 12 11 10 9 1 2 3 4 5 6 7 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 02/16 : 0 7 6 5 4 3 11 12 13 14 15 8 9 10 2 1 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 03/16 : 0 1 2 10 9 8 15 14 13 12 11 3 4 5 6 7 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 04/16 : 0 7 6 5 13 14 15 8 9 10 11 12 4 3 2 1 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 05/16 : 0 1 2 3 4 12 11 10 9 8 15 14 13 5 6 7 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 06/16 : 0 7 15 8 9 10 11 12 13 14 6 5 4 3 2 1 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 07/16 : 0 1 2 3 4 5 6 14 13 12 11 10 9 8 15 7 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 08/16 : 0 7 6 5 4 3 2 1 9 10 11 12 13 14 15 8 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 09/16 : 0 8 15 14 13 12 11 10 9 1 2 3 4 5 6 7 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 10/16 : 0 7 6 5 4 3 11 12 13 14 15 8 9 10 2 1 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 11/16 : 0 1 2 10 9 8 15 14 13 12 11 3 4 5 6 7 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 12/16 : 0 7 6 5 13 14 15 8 9 10 11 12 4 3 2 1 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 13/16 : 0 1 2 3 4 12 11 10 9 8 15 14 13 5 6 7 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 14/16 : 0 7 15 8 9 10 11 12 13 14 6 5 4 3 2 1 cnode7-012:1065350:1073390 [0] NCCL INFO Channel 15/16 : 0 1 2 3 4 5 6 14 13 12 11 10 9 8 15 7 cnode7-012:1065350:1073390 [0] NCCL INFO Trees [0] 1/8/-1->0->-1 [1] -1/-1/-1->0->7 [2] 1/-1/-1->0->7 [3] 1/-1/-1->0->7 [4] 1/-1/-1->0->7 [5] 1/-1/-1->0->7 [6] 1/-1/-1->0->7 [7] 1/-1/-1->0->7 [8] 1/-1/-1->0->8 [9] -1/-1/-1->0->7 [10] 1/-1/-1->0->7 [11] 1/-1/-1->0->7 [12] 1/-1/-1->0->7 [13] 1/-1/-1->0->7 [14] 1/-1/-1->0->7 [15] 1/-1/-1->0->7 cnode7-012:1065350:1073390 [0] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595411:604683 [1] NCCL INFO comm 0x5555c11d01f0 rank 9 nRanks 16 nNodes 2 localRanks 8 localRank 1 MNNVL 0 cnode7-013:595411:604683 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] 10/-1/-1->9->1 [2] -1/-1/-1->9->8 [3] 10/-1/-1->9->8 [4] 10/-1/-1->9->8 [5] 10/-1/-1->9->8 [6] 10/-1/-1->9->8 [7] 10/-1/-1->9->8 [8] 10/-1/-1->9->8 [9] 10/1/-1->9->-1 [10] -1/-1/-1->9->8 [11] 10/-1/-1->9->8 [12] 10/-1/-1->9->8 [13] 10/-1/-1->9->8 [14] 10/-1/-1->9->8 [15] 10/-1/-1->9->8 cnode7-013:595411:604683 [1] NCCL INFO P2P Chunksize set to 131072 cnode7-013:595410:604686 [0] NCCL INFO comm 0x5555b71d18d0 rank 8 nRanks 16 nNodes 2 localRanks 8 localRank 0 MNNVL 0 cnode7-013:595410:604686 [0] NCCL INFO Trees [0] 9/-1/-1->8->0 [1] -1/-1/-1->8->15 [2] 9/-1/-1->8->15 [3] 9/-1/-1->8->15 [4] 9/-1/-1->8->15 [5] 9/-1/-1->8->15 [6] 9/-1/-1->8->15 [7] 9/-1/-1->8->15 [8] 9/0/-1->8->-1 [9] -1/-1/-1->8->15 [10] 9/-1/-1->8->15 [11] 9/-1/-1->8->15 [12] 9/-1/-1->8->15 [13] 9/-1/-1->8->15 [14] 9/-1/-1->8->15 [15] 9/-1/-1->8->15 cnode7-013:595410:604686 [0] NCCL INFO P2P Chunksize set to 131072 cnode7-012:1065356:1074361 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 11/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 09/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 00/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 08/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 00/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 04/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 06/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 08/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 12/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 14/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 00/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 02/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 06/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 08/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 10/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 14/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 00/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 02/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 04/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 06/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 08/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 10/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 12/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 14/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 00/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 02/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 04/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 08/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 10/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 12/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 15/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 06/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 14/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 00/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 02/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 04/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 06/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 08/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 10/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 12/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 14/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 00/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 02/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 06/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 08/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 10/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 14/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 00/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 02/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 04/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 06/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 08/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 10/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 12/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 14/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 01/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 03/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 07/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 09/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 11/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 15/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 02/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 04/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 06/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 10/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 12/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 14/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 01/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 03/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 05/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 06/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 14/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 15/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 00/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 02/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 04/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 06/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 08/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 10/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 00/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 02/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 04/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 06/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 08/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 10/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 12/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 14/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 08/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 01/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 09/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 14/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 07/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 15/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 06/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 14/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 09/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 11/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 15/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 01/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 03/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 05/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 07/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 09/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 11/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 13/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 15/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 01/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 03/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 05/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 07/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 09/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 11/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 13/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 15/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 01/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 05/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 07/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 09/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 13/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 15/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 06/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 14/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 07/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 15/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 00/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 02/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 04/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 06/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 08/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 10/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 12/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 14/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 00/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 02/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 04/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 06/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 08/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 10/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 12/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 14/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 01/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 03/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 05/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 07/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 09/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 11/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 13/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 15/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 00/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 08/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 09/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 03/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 05/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 07/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 11/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 13/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 15/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 07/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 09/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 11/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 13/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 15/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 01/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 09/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 08/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 12/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 14/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 12/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 00/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 02/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 04/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 08/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 10/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 12/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 01/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 03/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 05/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 09/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 11/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 13/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Connected all rings cnode7-013:595412:604689 [2] NCCL INFO Channel 01/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 02/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 05/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 07/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 09/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 10/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 13/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 15/0 : 10[2] -> 11[3] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Connected all rings cnode7-013:595413:604685 [3] NCCL INFO Connected all rings cnode7-013:595413:604685 [3] NCCL INFO Channel 01/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 03/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 05/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Connected all rings cnode7-012:1065352:1073766 [2] NCCL INFO Connected all rings cnode7-012:1065352:1073766 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 10/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Connected all rings cnode7-012:1065354:1073901 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 08/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 10/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 12/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 14/0 : 4[4] -> 5[5] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 04/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 12/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Connected all rings cnode7-013:595416:604687 [6] NCCL INFO Channel 01/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 03/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 05/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 06/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 09/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 11/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 13/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 14/0 : 14[6] -> 15[7] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 06/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 14/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 00/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 02/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 04/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 08/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Connected all rings cnode7-013:595411:604683 [1] NCCL INFO Channel 01/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 03/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 05/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 07/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 09/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 11/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 13/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 15/0 : 9[1] -> 10[2] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 09/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 00/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 02/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 04/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 06/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 08/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 10/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 12/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-013:595411:604683 [1] NCCL INFO Channel 14/0 : 9[1] -> 8[0] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Connected all rings cnode7-012:1065355:1073463 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 10/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 12/0 : 5[5] -> 6[6] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 05/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 13/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 01/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 03/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 04/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 07/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 09/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Connected all rings cnode7-013:595415:604772 [5] NCCL INFO Channel 01/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 03/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 05/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 07/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 09/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 11/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 13/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 15/0 : 13[5] -> 14[6] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 05/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 13/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 00/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 02/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 04/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 06/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 08/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 10/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 12/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595415:604772 [5] NCCL INFO Channel 14/0 : 13[5] -> 12[4] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Connected all rings cnode7-013:595410:604686 [0] NCCL INFO Channel 00/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 03/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 05/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 07/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 08/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 11/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 13/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 15/0 : 8[0] -> 9[1] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 02/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 04/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 06/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 10/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 12/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 14/0 : 8[0] -> 15[7] via P2P/CUMEM cnode7-013:595410:604686 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 08/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Connected all rings cnode7-012:1065356:1074361 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 04/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 08/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 10/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 12/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 14/0 : 6[6] -> 7[7] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 06/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 14/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 01/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 03/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 05/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 07/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 09/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 11/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 13/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065356:1074361 [6] NCCL INFO Channel 15/0 : 6[6] -> 5[5] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Connected all rings cnode7-012:1065350:1073390 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 01/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 03/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 05/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 07/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 09/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 11/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 13/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 15/0 : 0[0] -> 7[7] via P2P/CUMEM cnode7-012:1065350:1073390 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 08/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Connected all rings cnode7-012:1065351:1073686 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 09/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 13/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065351:1073686 [1] NCCL INFO Channel 15/0 : 1[1] -> 0[0] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Connected all rings cnode7-012:1065353:1073779 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 02/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 06/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 10/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 14/0 : 3[3] -> 4[4] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 11/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 13/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065353:1073779 [3] NCCL INFO Channel 15/0 : 3[3] -> 2[2] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Connected all rings cnode7-012:1065357:1073392 [7] NCCL INFO Channel 07/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 15/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 02/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 04/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 06/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 10/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 12/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 14/0 : 7[7] -> 0[0] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 01/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 03/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 05/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 06/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 09/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 11/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 13/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-012:1065357:1073392 [7] NCCL INFO Channel 14/0 : 7[7] -> 6[6] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 10/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 00/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 04/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 06/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 08/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 12/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Channel 14/0 : 10[2] -> 9[1] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 01/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 03/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 04/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 07/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 09/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 11/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 12/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 15/0 : 12[4] -> 13[5] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 04/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 12/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 00/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 02/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 06/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 08/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 10/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595414:604684 [4] NCCL INFO Channel 14/0 : 12[4] -> 11[3] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 07/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 09/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 11/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 13/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 15/0 : 11[3] -> 12[4] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 11/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 00/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 02/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 04/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 06/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 08/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 10/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 12/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595413:604685 [3] NCCL INFO Channel 14/0 : 11[3] -> 10[2] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 07/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 15/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 01/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 03/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 05/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 07/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 09/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 11/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 13/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 15/0 : 15[7] -> 8[0] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 00/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 02/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 04/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 06/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 08/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 10/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 12/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-013:595417:604688 [7] NCCL INFO Channel 14/0 : 15[7] -> 14[6] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 13/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065352:1073766 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 01/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 03/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 05/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 07/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 09/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 11/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 13/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-012:1065354:1073901 [4] NCCL INFO Channel 15/0 : 4[4] -> 3[3] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 10/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-013:595416:604687 [6] NCCL INFO Channel 12/0 : 14[6] -> 13[5] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 11/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 12/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-012:1065355:1073463 [5] NCCL INFO Channel 15/0 : 5[5] -> 4[4] via P2P/CUMEM cnode7-013:595412:604689 [2] NCCL INFO Connected all trees cnode7-012:1065352:1073766 [2] NCCL INFO Connected all trees cnode7-013:595413:604685 [3] NCCL INFO Connected all trees cnode7-012:1065353:1073779 [3] NCCL INFO Connected all trees cnode7-013:595412:604689 [2] NCCL INFO NVLS comm 0x5555c11d2200 headRank 2 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595413:604685 [3] NCCL INFO NVLS comm 0x5555bea42f00 headRank 3 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595411:604683 [1] NCCL INFO Connected all trees cnode7-013:595415:604772 [5] NCCL INFO Connected all trees cnode7-013:595416:604687 [6] NCCL INFO Connected all trees cnode7-013:595414:604684 [4] NCCL INFO Connected all trees cnode7-013:595417:604688 [7] NCCL INFO Connected all trees cnode7-013:595410:604686 [0] NCCL INFO Connected all trees cnode7-012:1065354:1073901 [4] NCCL INFO Connected all trees cnode7-012:1065355:1073463 [5] NCCL INFO Connected all trees cnode7-012:1065356:1074361 [6] NCCL INFO Connected all trees cnode7-012:1065357:1073392 [7] NCCL INFO Connected all trees cnode7-012:1065350:1073390 [0] NCCL INFO Connected all trees cnode7-012:1065351:1073686 [1] NCCL INFO Connected all trees cnode7-012:1065352:1073766 [2] NCCL INFO NVLS comm 0x5555c11d81c0 headRank 2 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595416:604687 [6] NCCL INFO NVLS comm 0x5555d51cf510 headRank 6 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595411:604683 [1] NCCL INFO NVLS comm 0x5555c11d01f0 headRank 1 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595415:604772 [5] NCCL INFO NVLS comm 0x5555c11d89d0 headRank 5 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595410:604686 [0] NCCL INFO NVLS comm 0x5555b71d18d0 headRank 0 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:1065356:1074361 [6] NCCL INFO NVLS comm 0x5555c119a770 headRank 6 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:1065350:1073390 [0] NCCL INFO NVLS comm 0x5555b71c1940 headRank 0 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:1065351:1073686 [1] NCCL INFO NVLS comm 0x5555c11cc430 headRank 1 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:1065353:1073779 [3] NCCL INFO NVLS comm 0x5555c11eafa0 headRank 3 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-012:1065357:1073392 [7] NCCL INFO NVLS comm 0x5555c11be5e0 headRank 7 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595414:604684 [4] NCCL INFO NVLS comm 0x5555c11d8790 headRank 4 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595417:604688 [7] NCCL INFO NVLS comm 0x5555c11a5720 headRank 7 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595417:604688 [7] NCCL INFO Channel 00/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 01/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 02/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 03/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 04/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 05/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 08/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 09/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 10/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 11/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 12/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 13/0 : 7[7] -> 15[7] [receive] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 00/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 01/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 02/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 03/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 04/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 05/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 06/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 08/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 09/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 10/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 11/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 12/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 13/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-013:595417:604688 [7] NCCL INFO Channel 14/0 : 15[7] -> 7[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO NVLS comm 0x5555c11bcf20 headRank 4 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595416:604687 [6] NCCL INFO Channel 00/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 01/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 02/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 03/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 04/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 05/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 08/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 09/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 10/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 11/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 12/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 13/0 : 6[6] -> 14[6] [receive] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 00/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 01/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 02/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 03/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 04/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 05/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 07/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 08/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 09/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 10/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 11/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 12/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 13/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595416:604687 [6] NCCL INFO Channel 15/0 : 14[6] -> 6[6] [send] via NET/IB/7/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 02/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 03/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 04/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 06/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 07/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 10/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 11/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 12/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 13/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 14/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 15/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 00/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 02/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 03/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 04/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 06/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 07/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 08/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 10/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 11/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 12/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 13/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 14/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-013:595411:604683 [1] NCCL INFO Channel 15/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO NVLS comm 0x5555d51e2c30 headRank 5 nHeads 8 buffSize 4194304 memSize 2097152 nvlsPerRankSize 201326592 nvlsTotalSize 1610612736 cnode7-013:595415:604772 [5] NCCL INFO Channel 00/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 01/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 02/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 03/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 06/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 07/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 08/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 09/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 10/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 11/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 14/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 15/0 : 5[5] -> 13[5] [receive] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 00/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 01/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 02/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 03/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 04/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 06/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 07/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 08/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 09/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 10/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 11/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 12/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 14/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595415:604772 [5] NCCL INFO Channel 15/0 : 13[5] -> 5[5] [send] via NET/IB/6/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 02/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 03/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 05/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 06/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 07/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 10/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 11/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 12/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 13/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 14/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 15/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 01/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 02/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 03/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 05/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 06/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 07/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 09/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 10/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 11/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 12/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 13/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 14/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-013:595410:604686 [0] NCCL INFO Channel 15/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 00/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 01/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 02/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 03/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 04/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 05/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 07/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 08/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 09/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 10/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 11/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 12/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 13/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 15/0 : 14[6] -> 6[6] [receive] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 00/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 01/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 02/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 03/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 04/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 05/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 08/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 09/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 10/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 11/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 12/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065356:1074361 [6] NCCL INFO Channel 13/0 : 6[6] -> 14[6] [send] via NET/IB/7/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 01/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 02/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 03/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 05/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 06/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 07/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 09/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 10/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 11/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 12/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 13/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 14/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 15/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 02/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 03/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 05/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 06/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 07/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 10/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 11/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 12/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 13/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 14/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065350:1073390 [0] NCCL INFO Channel 15/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 00/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 02/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 03/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 04/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 06/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 07/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 08/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 10/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 11/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 12/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 13/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 14/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 15/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 02/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 03/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 04/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 06/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 07/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 10/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 11/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 12/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 13/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 14/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065351:1073686 [1] NCCL INFO Channel 15/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 00/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 01/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 02/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 04/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 05/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 06/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 08/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 09/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 10/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 12/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 13/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 14/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 15/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 00/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 01/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 04/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 05/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 06/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 08/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 09/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 12/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 13/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 14/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Channel 15/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 00/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 01/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 02/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 03/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 04/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 05/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 06/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 08/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 09/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 10/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 11/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 12/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 13/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 14/0 : 15[7] -> 7[7] [receive] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 00/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 01/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 02/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 03/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 04/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 05/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 08/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 09/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 10/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 11/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 12/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-012:1065357:1073392 [7] NCCL INFO Channel 13/0 : 7[7] -> 15[7] [send] via NET/IB/8/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 00/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 04/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 07/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 08/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 09/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 12/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 13/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 14/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 15/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 00/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 03/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 04/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 07/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 08/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 09/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 11/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 12/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 13/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 14/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595412:604689 [2] NCCL INFO Channel 15/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 00/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 01/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 02/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 03/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 06/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 07/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 08/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 09/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 10/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 11/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 14/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 15/0 : 4[4] -> 12[4] [receive] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 00/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 01/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 02/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 03/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 05/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 06/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 07/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 08/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 09/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 10/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 11/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 13/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 14/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595414:604684 [4] NCCL INFO Channel 15/0 : 12[4] -> 4[4] [send] via NET/IB/4/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 00/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 01/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 04/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 05/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 06/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 08/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 09/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 12/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 13/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 14/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 15/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 00/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 01/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 02/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 04/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 05/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 06/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 08/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 09/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 10/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 12/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 13/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 14/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-013:595413:604685 [3] NCCL INFO Channel 15/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 00/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 03/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 04/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 07/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 08/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 09/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 11/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 12/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 13/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 14/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 15/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 00/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 04/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 07/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 08/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 09/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 12/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 13/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 14/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065352:1073766 [2] NCCL INFO Channel 15/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 00/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 01/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 02/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 03/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 05/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 06/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 07/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 08/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 09/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 10/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 11/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 13/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 14/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 15/0 : 12[4] -> 4[4] [receive] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 00/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 01/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 02/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 03/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 06/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 07/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 08/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 09/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 10/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 11/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 14/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065354:1073901 [4] NCCL INFO Channel 15/0 : 4[4] -> 12[4] [send] via NET/IB/4/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 00/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 01/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 02/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 03/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 04/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 06/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 07/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 08/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 09/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 10/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 11/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 12/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 14/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 15/0 : 13[5] -> 5[5] [receive] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 00/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 01/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 02/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 03/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 06/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 07/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 08/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 09/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 10/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 11/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 14/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065355:1073463 [5] NCCL INFO Channel 15/0 : 5[5] -> 13[5] [send] via NET/IB/6/GDRDMA cnode7-012:1065353:1073779 [3] NCCL INFO Connected NVLS tree cnode7-012:1065353:1073779 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065353:1073779 [3] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065350:1073390 [0] NCCL INFO Connected NVLS tree cnode7-012:1065350:1073390 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065350:1073390 [0] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065352:1073766 [2] NCCL INFO Connected NVLS tree cnode7-012:1065352:1073766 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065352:1073766 [2] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065355:1073463 [5] NCCL INFO Connected NVLS tree cnode7-012:1065355:1073463 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065355:1073463 [5] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065357:1073392 [7] NCCL INFO Connected NVLS tree cnode7-012:1065357:1073392 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065357:1073392 [7] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595412:604689 [2] NCCL INFO Connected NVLS tree cnode7-013:595412:604689 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595412:604689 [2] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595414:604684 [4] NCCL INFO Connected NVLS tree cnode7-013:595414:604684 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595414:604684 [4] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595410:604686 [0] NCCL INFO Connected NVLS tree cnode7-013:595410:604686 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595410:604686 [0] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595413:604685 [3] NCCL INFO Connected NVLS tree cnode7-013:595413:604685 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595413:604685 [3] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595415:604772 [5] NCCL INFO Connected NVLS tree cnode7-013:595415:604772 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595415:604772 [5] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595417:604688 [7] NCCL INFO Connected NVLS tree cnode7-013:595417:604688 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595417:604688 [7] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065356:1074361 [6] NCCL INFO Connected NVLS tree cnode7-012:1065356:1074361 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065356:1074361 [6] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595416:604687 [6] NCCL INFO Connected NVLS tree cnode7-013:595416:604687 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595416:604687 [6] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595411:604683 [1] NCCL INFO Connected NVLS tree cnode7-013:595411:604683 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-013:595411:604683 [1] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065354:1073901 [4] NCCL INFO Connected NVLS tree cnode7-012:1065354:1073901 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065354:1073901 [4] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-012:1065351:1073686 [1] NCCL INFO Connected NVLS tree cnode7-012:1065351:1073686 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 cnode7-012:1065351:1073686 [1] NCCL INFO 16 coll channels, 0 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer cnode7-013:595416:604687 [6] NCCL INFO comm 0x5555d51cf510 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x4f9bc01e40954c2b - Init COMPLETE cnode7-013:595417:604688 [7] NCCL INFO comm 0x5555c11a5720 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x4f9bc01e40954c2b - Init COMPLETE cnode7-013:595413:604685 [3] NCCL INFO comm 0x5555bea42f00 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x4f9bc01e40954c2b - Init COMPLETE cnode7-013:595412:604689 [2] NCCL INFO comm 0x5555c11d2200 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x4f9bc01e40954c2b - Init COMPLETE cnode7-013:595410:604686 [0] NCCL INFO comm 0x5555b71d18d0 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x4f9bc01e40954c2b - Init COMPLETE cnode7-013:595411:604683 [1] NCCL INFO comm 0x5555c11d01f0 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x4f9bc01e40954c2b - Init COMPLETE cnode7-013:595415:604772 [5] NCCL INFO comm 0x5555c11d89d0 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x4f9bc01e40954c2b - Init COMPLETE cnode7-013:595414:604684 [4] NCCL INFO comm 0x5555c11d8790 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x4f9bc01e40954c2b - Init COMPLETE cnode7-012:1065354:1073901 [4] NCCL INFO comm 0x5555c11bcf20 rank 4 nranks 16 cudaDev 4 nvmlDev 4 busId 9d000 commId 0x4f9bc01e40954c2b - Init COMPLETE cnode7-012:1065356:1074361 [6] NCCL INFO comm 0x5555c119a770 rank 6 nranks 16 cudaDev 6 nvmlDev 6 busId d1000 commId 0x4f9bc01e40954c2b - Init COMPLETE cnode7-012:1065353:1073779 [3] NCCL INFO comm 0x5555c11eafa0 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId 61000 commId 0x4f9bc01e40954c2b - Init COMPLETE cnode7-012:1065351:1073686 [1] NCCL INFO comm 0x5555c11cc430 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 43000 commId 0x4f9bc01e40954c2b - Init COMPLETE cnode7-012:1065355:1073463 [5] NCCL INFO comm 0x5555d51e2c30 rank 5 nranks 16 cudaDev 5 nvmlDev 5 busId c3000 commId 0x4f9bc01e40954c2b - Init COMPLETE cnode7-012:1065357:1073392 [7] NCCL INFO comm 0x5555c11be5e0 rank 7 nranks 16 cudaDev 7 nvmlDev 7 busId df000 commId 0x4f9bc01e40954c2b - Init COMPLETE cnode7-012:1065350:1073390 [0] NCCL INFO comm 0x5555b71c1940 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x4f9bc01e40954c2b - Init COMPLETE cnode7-012:1065352:1073766 [2] NCCL INFO comm 0x5555c11d81c0 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 52000 commId 0x4f9bc01e40954c2b - Init COMPLETE [cnode7-012:0/16] 2024-12-07 19:19:57,717 (deepspeed_trainer:228) INFO: 27epoch:train:1-100batch: iter_time=2.450, loss_ctc=69.770, loss_att=57.294, acc=0.703, loss=61.030, grad_norm=5.921, loss_scale=1.000, learning_rate=7.844e-05, step_time=0.568 [cnode7-012:0/16] 2024-12-07 19:20:30,973 (deepspeed_trainer:228) INFO: 27epoch:train:101-200batch: iter_time=1.121e-04, loss_ctc=77.836, loss_att=59.834, acc=0.701, loss=65.223, grad_norm=5.396, loss_scale=1.000, learning_rate=7.843e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 19:21:04,070 (deepspeed_trainer:228) INFO: 27epoch:train:201-300batch: iter_time=1.166e-04, loss_ctc=68.770, loss_att=50.175, acc=0.724, loss=55.777, grad_norm=4.634, loss_scale=1.000, learning_rate=7.842e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 19:21:36,771 (deepspeed_trainer:228) INFO: 27epoch:train:301-400batch: iter_time=1.136e-04, loss_ctc=72.112, loss_att=55.761, acc=0.699, loss=60.667, grad_norm=5.278, loss_scale=1.000, learning_rate=7.841e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-07 19:22:09,719 (deepspeed_trainer:228) INFO: 27epoch:train:401-500batch: iter_time=1.129e-04, loss_ctc=77.252, loss_att=56.361, acc=0.711, loss=62.627, grad_norm=4.975, loss_scale=1.000, learning_rate=7.840e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 19:22:42,115 (deepspeed_trainer:228) INFO: 27epoch:train:501-600batch: iter_time=1.132e-04, loss_ctc=71.929, loss_att=49.445, acc=0.715, loss=56.222, grad_norm=4.942, loss_scale=1.000, learning_rate=7.839e-05, step_time=0.324 [cnode7-012:0/16] 2024-12-07 19:23:14,959 (deepspeed_trainer:228) INFO: 27epoch:train:601-700batch: iter_time=1.119e-04, loss_ctc=75.913, loss_att=58.921, acc=0.706, loss=64.018, grad_norm=5.708, loss_scale=1.000, learning_rate=7.838e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 19:23:47,561 (deepspeed_trainer:228) INFO: 27epoch:train:701-800batch: iter_time=1.138e-04, loss_ctc=76.627, loss_att=58.875, acc=0.703, loss=64.158, grad_norm=4.926, loss_scale=1.000, learning_rate=7.837e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-07 19:24:20,217 (deepspeed_trainer:228) INFO: 27epoch:train:801-900batch: iter_time=1.147e-04, loss_ctc=81.258, loss_att=59.575, acc=0.715, loss=66.061, grad_norm=5.207, loss_scale=1.000, learning_rate=7.836e-05, step_time=0.326 [2024-12-07 19:24:52,987] [INFO] [logging.py:129:log_dist] [Rank 0] step=391000, skipped=0, lr=[np.float64(7.83459744877521e-05)], mom=[[0.9, 0.98]] [2024-12-07 19:24:52,987] [INFO] [timer.py:264:stop] epoch=0/micro_step=1000/global_step=1000, RunningAvgSamplesPerSec=49.93772367390467, CurrSamplesPerSec=51.66404322999303, MemAllocated=1.77GB, MaxMemAllocated=25.4GB [cnode7-012:0/16] 2024-12-07 19:24:52,990 (deepspeed_trainer:228) INFO: 27epoch:train:901-1000batch: iter_time=1.158e-04, loss_ctc=73.520, loss_att=55.288, acc=0.719, loss=60.752, grad_norm=5.030, loss_scale=1.000, learning_rate=7.835e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-07 19:25:25,290 (deepspeed_trainer:228) INFO: 27epoch:train:1001-1100batch: iter_time=1.153e-04, loss_ctc=76.666, loss_att=61.604, acc=0.694, loss=66.088, grad_norm=5.984, loss_scale=1.000, learning_rate=7.834e-05, step_time=0.323 [cnode7-012:0/16] 2024-12-07 19:25:57,793 (deepspeed_trainer:228) INFO: 27epoch:train:1101-1200batch: iter_time=1.110e-04, loss_ctc=80.745, loss_att=61.429, acc=0.708, loss=67.206, grad_norm=5.247, loss_scale=1.000, learning_rate=7.833e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-07 19:26:30,259 (deepspeed_trainer:228) INFO: 27epoch:train:1201-1300batch: iter_time=1.159e-04, loss_ctc=72.427, loss_att=56.618, acc=0.702, loss=61.391, grad_norm=4.897, loss_scale=1.000, learning_rate=7.832e-05, step_time=0.324 [cnode7-012:0/16] 2024-12-07 19:27:02,746 (deepspeed_trainer:228) INFO: 27epoch:train:1301-1400batch: iter_time=1.126e-04, loss_ctc=71.724, loss_att=54.076, acc=0.713, loss=59.354, grad_norm=5.525, loss_scale=1.000, learning_rate=7.831e-05, step_time=0.324 [cnode7-012:0/16] 2024-12-07 19:27:35,063 (deepspeed_trainer:228) INFO: 27epoch:train:1401-1500batch: iter_time=1.169e-04, loss_ctc=73.478, loss_att=52.569, acc=0.709, loss=58.857, grad_norm=5.046, loss_scale=1.000, learning_rate=7.830e-05, step_time=0.323 [cnode7-012:0/16] 2024-12-07 19:28:07,881 (deepspeed_trainer:228) INFO: 27epoch:train:1501-1600batch: iter_time=1.167e-04, loss_ctc=80.118, loss_att=63.938, acc=0.697, loss=68.782, grad_norm=5.645, loss_scale=1.000, learning_rate=7.829e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 19:28:40,632 (deepspeed_trainer:228) INFO: 27epoch:train:1601-1700batch: iter_time=1.146e-04, loss_ctc=74.580, loss_att=63.839, acc=0.700, loss=67.054, grad_norm=5.194, loss_scale=1.000, learning_rate=7.828e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-07 19:29:13,394 (deepspeed_trainer:228) INFO: 27epoch:train:1701-1800batch: iter_time=1.160e-04, loss_ctc=77.057, loss_att=58.468, acc=0.702, loss=64.054, grad_norm=5.440, loss_scale=1.000, learning_rate=7.827e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-07 19:29:42,988 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-07 19:30:09,433 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 19:30:25,807 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 19:30:25,807 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 19:30:25,810 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 19:30:52,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:30:52,321] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:30:52,367] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:30:53,524] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:30:54,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:30:53,895] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:30:54,141] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:30:54,205] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:30:54,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:30:54,298] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:30:54,723] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:30:55,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:30:55,505] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:30:56,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:30:56,482] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:30:58,390] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:42,116] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:42,145] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:42,382] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:43,264] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:43,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:43,640] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:43,881] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:44,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:44,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:44,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:45,039] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:45,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:46,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:46,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:46,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:31:46,417] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:31,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:31,904] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:32,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:32,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:32,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:33,006] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:33,138] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:33,730] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:34,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:34,092] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:34,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:34,459] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:34,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:34,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:35,058] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:32:38,273] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:18,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:19,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:21,141] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:21,109] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:21,696] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:21,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:21,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:21,831] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:22,411] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:22,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:22,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:22,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:22,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:23,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:23,457] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:33:32,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 19:34:06,409 (deepspeed_trainer:228) INFO: 27epoch:train:1801-1900batch: iter_time=2.527, loss_ctc=78.835, loss_att=56.197, acc=0.708, loss=62.993, grad_norm=6.834, loss_scale=1.000, learning_rate=7.826e-05, step_time=0.402 [2024-12-07 19:34:39,859] [INFO] [logging.py:129:log_dist] [Rank 0] step=392000, skipped=0, lr=[np.float64(7.824597984011309e-05)], mom=[[0.9, 0.98]] [2024-12-07 19:34:39,860] [INFO] [timer.py:264:stop] epoch=0/micro_step=2000/global_step=2000, RunningAvgSamplesPerSec=49.563893814830074, CurrSamplesPerSec=49.48621823612641, MemAllocated=1.77GB, MaxMemAllocated=25.4GB [cnode7-012:0/16] 2024-12-07 19:34:39,863 (deepspeed_trainer:228) INFO: 27epoch:train:1901-2000batch: iter_time=1.107e-04, loss_ctc=65.960, loss_att=50.302, acc=0.714, loss=54.983, grad_norm=4.853, loss_scale=1.000, learning_rate=7.825e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 19:35:13,175 (deepspeed_trainer:228) INFO: 27epoch:train:2001-2100batch: iter_time=1.097e-04, loss_ctc=74.385, loss_att=57.510, acc=0.711, loss=62.580, grad_norm=5.130, loss_scale=1.000, learning_rate=7.824e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 19:35:44,593 (deepspeed_trainer:228) INFO: 27epoch:train:2101-2200batch: iter_time=1.111e-04, loss_ctc=66.828, loss_att=48.503, acc=0.724, loss=53.966, grad_norm=4.310, loss_scale=1.000, learning_rate=7.823e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 19:36:17,586 (deepspeed_trainer:228) INFO: 27epoch:train:2201-2300batch: iter_time=1.123e-04, loss_ctc=75.536, loss_att=55.131, acc=0.713, loss=61.263, grad_norm=5.274, loss_scale=1.000, learning_rate=7.822e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 19:36:50,723 (deepspeed_trainer:228) INFO: 27epoch:train:2301-2400batch: iter_time=1.120e-04, loss_ctc=78.631, loss_att=56.051, acc=0.712, loss=62.830, grad_norm=5.121, loss_scale=1.000, learning_rate=7.821e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 19:37:23,790 (deepspeed_trainer:228) INFO: 27epoch:train:2401-2500batch: iter_time=1.116e-04, loss_ctc=69.240, loss_att=49.903, acc=0.715, loss=55.703, grad_norm=4.769, loss_scale=1.000, learning_rate=7.820e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 19:37:56,762 (deepspeed_trainer:228) INFO: 27epoch:train:2501-2600batch: iter_time=1.109e-04, loss_ctc=76.512, loss_att=58.525, acc=0.711, loss=63.959, grad_norm=5.766, loss_scale=1.000, learning_rate=7.819e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 19:38:29,631 (deepspeed_trainer:228) INFO: 27epoch:train:2601-2700batch: iter_time=1.102e-04, loss_ctc=78.209, loss_att=60.199, acc=0.701, loss=65.595, grad_norm=5.242, loss_scale=1.000, learning_rate=7.818e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 19:39:02,998 (deepspeed_trainer:228) INFO: 27epoch:train:2701-2800batch: iter_time=1.162e-04, loss_ctc=79.061, loss_att=56.935, acc=0.722, loss=63.575, grad_norm=4.981, loss_scale=1.000, learning_rate=7.817e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 19:39:36,435 (deepspeed_trainer:228) INFO: 27epoch:train:2801-2900batch: iter_time=1.177e-04, loss_ctc=74.963, loss_att=57.633, acc=0.711, loss=62.836, grad_norm=4.859, loss_scale=1.000, learning_rate=7.816e-05, step_time=0.334 [2024-12-07 19:40:09,645] [INFO] [logging.py:129:log_dist] [Rank 0] step=393000, skipped=0, lr=[np.float64(7.814636709366069e-05)], mom=[[0.9, 0.98]] [2024-12-07 19:40:09,645] [INFO] [timer.py:264:stop] epoch=0/micro_step=3000/global_step=3000, RunningAvgSamplesPerSec=49.73825052623512, CurrSamplesPerSec=48.03711230267615, MemAllocated=1.77GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 19:40:09,649 (deepspeed_trainer:228) INFO: 27epoch:train:2901-3000batch: iter_time=1.200e-04, loss_ctc=73.689, loss_att=56.032, acc=0.708, loss=61.339, grad_norm=4.782, loss_scale=1.000, learning_rate=7.815e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 19:40:42,821 (deepspeed_trainer:228) INFO: 27epoch:train:3001-3100batch: iter_time=1.345e-04, loss_ctc=75.129, loss_att=56.802, acc=0.717, loss=62.328, grad_norm=4.989, loss_scale=1.000, learning_rate=7.814e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 19:41:15,481 (deepspeed_trainer:228) INFO: 27epoch:train:3101-3200batch: iter_time=1.132e-04, loss_ctc=73.858, loss_att=58.421, acc=0.702, loss=63.040, grad_norm=4.660, loss_scale=1.000, learning_rate=7.813e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-07 19:41:48,034 (deepspeed_trainer:228) INFO: 27epoch:train:3201-3300batch: iter_time=1.121e-04, loss_ctc=73.397, loss_att=57.474, acc=0.705, loss=62.258, grad_norm=4.594, loss_scale=1.000, learning_rate=7.812e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-07 19:42:20,778 (deepspeed_trainer:228) INFO: 27epoch:train:3301-3400batch: iter_time=1.117e-04, loss_ctc=72.057, loss_att=51.907, acc=0.714, loss=57.962, grad_norm=5.161, loss_scale=1.000, learning_rate=7.811e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-07 19:42:53,568 (deepspeed_trainer:228) INFO: 27epoch:train:3401-3500batch: iter_time=1.117e-04, loss_ctc=78.478, loss_att=64.554, acc=0.696, loss=68.739, grad_norm=5.398, loss_scale=1.000, learning_rate=7.810e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 19:43:26,186 (deepspeed_trainer:228) INFO: 27epoch:train:3501-3600batch: iter_time=1.096e-04, loss_ctc=72.174, loss_att=58.007, acc=0.707, loss=62.268, grad_norm=5.110, loss_scale=1.000, learning_rate=7.809e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-07 19:43:58,849 (deepspeed_trainer:228) INFO: 27epoch:train:3601-3700batch: iter_time=1.117e-04, loss_ctc=73.974, loss_att=54.917, acc=0.708, loss=60.621, grad_norm=5.253, loss_scale=1.000, learning_rate=7.808e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-07 19:44:19,387 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-07 19:44:46,163 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 19:45:03,285 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 19:45:03,286 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 19:45:03,288 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 19:45:28,911] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:45:29,866] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:45:29,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:45:28,382] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:45:28,814] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:45:29,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:45:30,981] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:45:31,301] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:45:31,586] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:45:31,882] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:45:32,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:45:32,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:45:32,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:45:32,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:45:33,261] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:45:33,302] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:16,060] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:15,362] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:15,907] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:17,852] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:18,115] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:18,720] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:19,425] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:20,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:20,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:20,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:19,282] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:21,020] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:21,414] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:22,434] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:22,689] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:46:22,706] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:02,714] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:05,381] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:03,757] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:05,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:05,976] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:04,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:08,045] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:08,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:09,538] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:10,020] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:11,935] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:10,567] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:10,622] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:10,762] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:12,622] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:11,091] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:49,757] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:52,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:51,090] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:52,900] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:51,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:53,896] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:55,889] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:56,365] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:57,980] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:59,264] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:59,345] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:47:59,895] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:48:01,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:48:00,034] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:48:01,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 19:48:03,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 19:48:43,904 (deepspeed_trainer:228) INFO: 27epoch:train:3701-3800batch: iter_time=2.497, loss_ctc=76.988, loss_att=55.222, acc=0.712, loss=61.744, grad_norm=6.060, loss_scale=1.000, learning_rate=7.807e-05, step_time=0.353 [cnode7-012:0/16] 2024-12-07 19:49:16,816 (deepspeed_trainer:228) INFO: 27epoch:train:3801-3900batch: iter_time=1.092e-04, loss_ctc=68.100, loss_att=53.399, acc=0.712, loss=57.825, grad_norm=4.768, loss_scale=1.000, learning_rate=7.806e-05, step_time=0.329 [2024-12-07 19:49:49,892] [INFO] [logging.py:129:log_dist] [Rank 0] step=394000, skipped=0, lr=[np.float64(7.804713382362935e-05)], mom=[[0.9, 0.98]] [2024-12-07 19:49:49,893] [INFO] [timer.py:264:stop] epoch=0/micro_step=4000/global_step=4000, RunningAvgSamplesPerSec=49.79393329985911, CurrSamplesPerSec=47.89575853278645, MemAllocated=1.77GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 19:49:49,896 (deepspeed_trainer:228) INFO: 27epoch:train:3901-4000batch: iter_time=1.137e-04, loss_ctc=73.101, loss_att=54.759, acc=0.722, loss=60.268, grad_norm=4.953, loss_scale=1.000, learning_rate=7.805e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 19:50:22,983 (deepspeed_trainer:228) INFO: 27epoch:train:4001-4100batch: iter_time=1.179e-04, loss_ctc=71.015, loss_att=52.862, acc=0.719, loss=58.297, grad_norm=4.846, loss_scale=1.000, learning_rate=7.804e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 19:50:56,457 (deepspeed_trainer:228) INFO: 27epoch:train:4101-4200batch: iter_time=1.164e-04, loss_ctc=71.766, loss_att=51.937, acc=0.716, loss=57.901, grad_norm=4.985, loss_scale=1.000, learning_rate=7.803e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 19:51:29,959 (deepspeed_trainer:228) INFO: 27epoch:train:4201-4300batch: iter_time=1.148e-04, loss_ctc=80.511, loss_att=57.894, acc=0.717, loss=64.645, grad_norm=5.461, loss_scale=1.000, learning_rate=7.802e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-07 19:52:02,905 (deepspeed_trainer:228) INFO: 27epoch:train:4301-4400batch: iter_time=1.159e-04, loss_ctc=60.652, loss_att=43.994, acc=0.717, loss=48.991, grad_norm=4.518, loss_scale=1.000, learning_rate=7.801e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 19:52:36,195 (deepspeed_trainer:228) INFO: 27epoch:train:4401-4500batch: iter_time=1.195e-04, loss_ctc=84.907, loss_att=66.924, acc=0.702, loss=72.321, grad_norm=6.288, loss_scale=1.000, learning_rate=7.800e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 19:53:09,452 (deepspeed_trainer:228) INFO: 27epoch:train:4501-4600batch: iter_time=1.182e-04, loss_ctc=77.547, loss_att=56.998, acc=0.714, loss=63.172, grad_norm=4.850, loss_scale=1.000, learning_rate=7.799e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 19:53:42,725 (deepspeed_trainer:228) INFO: 27epoch:train:4601-4700batch: iter_time=1.194e-04, loss_ctc=76.265, loss_att=55.496, acc=0.716, loss=61.764, grad_norm=4.652, loss_scale=1.000, learning_rate=7.798e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 19:54:15,747 (deepspeed_trainer:228) INFO: 27epoch:train:4701-4800batch: iter_time=1.180e-04, loss_ctc=77.483, loss_att=61.674, acc=0.707, loss=66.431, grad_norm=5.377, loss_scale=1.000, learning_rate=7.797e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 19:54:48,338 (deepspeed_trainer:228) INFO: 27epoch:train:4801-4900batch: iter_time=1.210e-04, loss_ctc=68.910, loss_att=49.573, acc=0.720, loss=55.379, grad_norm=4.549, loss_scale=1.000, learning_rate=7.796e-05, step_time=0.326 [2024-12-07 19:55:21,023] [INFO] [logging.py:129:log_dist] [Rank 0] step=395000, skipped=0, lr=[np.float64(7.794827762675231e-05)], mom=[[0.9, 0.98]] [2024-12-07 19:55:21,024] [INFO] [timer.py:264:stop] epoch=0/micro_step=5000/global_step=5000, RunningAvgSamplesPerSec=49.80061065201819, CurrSamplesPerSec=49.38766958031043, MemAllocated=1.77GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 19:55:21,026 (deepspeed_trainer:228) INFO: 27epoch:train:4901-5000batch: iter_time=1.226e-04, loss_ctc=74.541, loss_att=56.839, acc=0.716, loss=62.158, grad_norm=4.790, loss_scale=1.000, learning_rate=7.795e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-07 19:55:54,584 (deepspeed_trainer:228) INFO: 27epoch:train:5001-5100batch: iter_time=1.195e-04, loss_ctc=73.475, loss_att=58.807, acc=0.698, loss=63.217, grad_norm=4.489, loss_scale=1.000, learning_rate=7.794e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-07 19:56:27,414 (deepspeed_trainer:228) INFO: 27epoch:train:5101-5200batch: iter_time=1.190e-04, loss_ctc=74.600, loss_att=57.188, acc=0.710, loss=62.442, grad_norm=4.717, loss_scale=1.000, learning_rate=7.793e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 19:57:00,522 (deepspeed_trainer:228) INFO: 27epoch:train:5201-5300batch: iter_time=1.134e-04, loss_ctc=75.001, loss_att=56.249, acc=0.718, loss=61.854, grad_norm=5.046, loss_scale=1.000, learning_rate=7.792e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 19:57:33,384 (deepspeed_trainer:228) INFO: 27epoch:train:5301-5400batch: iter_time=1.147e-04, loss_ctc=69.836, loss_att=56.768, acc=0.701, loss=60.654, grad_norm=4.952, loss_scale=1.000, learning_rate=7.791e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 19:58:06,487 (deepspeed_trainer:228) INFO: 27epoch:train:5401-5500batch: iter_time=1.180e-04, loss_ctc=78.674, loss_att=63.322, acc=0.706, loss=67.929, grad_norm=5.178, loss_scale=1.000, learning_rate=7.790e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 19:58:39,754 (deepspeed_trainer:228) INFO: 27epoch:train:5501-5600batch: iter_time=1.162e-04, loss_ctc=70.787, loss_att=50.320, acc=0.719, loss=56.444, grad_norm=4.849, loss_scale=1.000, learning_rate=7.789e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 19:58:52,056 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-07 19:59:18,431 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 19:59:35,004 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 19:59:35,005 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 19:59:35,007 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 19:59:59,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:00,143] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:01,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:02,544] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:02,601] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:02,881] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:03,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:01,972] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:04,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:04,415] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:03,484] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:04,592] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:06,381] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:05,123] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:05,390] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:05,926] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:47,261] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:48,895] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:48,367] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:50,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:51,708] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:52,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:52,380] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:52,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:53,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:53,044] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:54,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:53,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:54,016] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:54,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:54,634] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:00:55,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:37,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:36,339] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:37,270] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:39,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:40,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:40,946] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:41,005] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:41,591] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:42,303] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:43,835] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:43,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:43,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:44,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:44,333] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:44,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:01:46,928] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:24,473] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:24,500] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:25,948] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:28,337] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:28,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:30,212] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:30,553] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:30,759] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:32,400] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:33,358] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:35,120] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:34,179] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:34,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:35,142] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:37,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:02:46,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 20:03:36,854 (deepspeed_trainer:228) INFO: 27epoch:train:5601-5700batch: iter_time=2.534, loss_ctc=74.553, loss_att=59.214, acc=0.710, loss=63.834, grad_norm=5.541, loss_scale=1.000, learning_rate=7.788e-05, step_time=0.437 [cnode7-012:0/16] 2024-12-07 20:04:10,452 (deepspeed_trainer:228) INFO: 27epoch:train:5701-5800batch: iter_time=1.118e-04, loss_ctc=69.933, loss_att=56.047, acc=0.712, loss=60.235, grad_norm=4.544, loss_scale=1.000, learning_rate=7.787e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-07 20:04:43,847 (deepspeed_trainer:228) INFO: 27epoch:train:5801-5900batch: iter_time=1.129e-04, loss_ctc=69.881, loss_att=51.440, acc=0.731, loss=56.962, grad_norm=5.259, loss_scale=1.000, learning_rate=7.786e-05, step_time=0.334 [2024-12-07 20:05:17,304] [INFO] [logging.py:129:log_dist] [Rank 0] step=396000, skipped=0, lr=[np.float64(7.784979612101723e-05)], mom=[[0.9, 0.98]] [2024-12-07 20:05:17,305] [INFO] [timer.py:264:stop] epoch=0/micro_step=6000/global_step=6000, RunningAvgSamplesPerSec=49.50980550144728, CurrSamplesPerSec=49.90693272963333, MemAllocated=1.77GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 20:05:17,307 (deepspeed_trainer:228) INFO: 27epoch:train:5901-6000batch: iter_time=1.134e-04, loss_ctc=73.728, loss_att=57.716, acc=0.709, loss=62.529, grad_norm=5.360, loss_scale=1.000, learning_rate=7.785e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 20:05:50,792 (deepspeed_trainer:228) INFO: 27epoch:train:6001-6100batch: iter_time=1.082e-04, loss_ctc=70.229, loss_att=51.612, acc=0.728, loss=57.195, grad_norm=4.546, loss_scale=1.000, learning_rate=7.784e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 20:06:24,165 (deepspeed_trainer:228) INFO: 27epoch:train:6101-6200batch: iter_time=1.146e-04, loss_ctc=76.408, loss_att=55.151, acc=0.715, loss=61.530, grad_norm=5.073, loss_scale=1.000, learning_rate=7.784e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 20:06:57,212 (deepspeed_trainer:228) INFO: 27epoch:train:6201-6300batch: iter_time=1.093e-04, loss_ctc=68.977, loss_att=52.885, acc=0.718, loss=57.673, grad_norm=5.166, loss_scale=1.000, learning_rate=7.783e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 20:07:30,478 (deepspeed_trainer:228) INFO: 27epoch:train:6301-6400batch: iter_time=1.160e-04, loss_ctc=81.140, loss_att=65.382, acc=0.704, loss=70.087, grad_norm=6.326, loss_scale=1.000, learning_rate=7.782e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 20:08:03,713 (deepspeed_trainer:228) INFO: 27epoch:train:6401-6500batch: iter_time=1.117e-04, loss_ctc=76.413, loss_att=55.595, acc=0.728, loss=61.843, grad_norm=4.891, loss_scale=1.000, learning_rate=7.781e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 20:08:36,831 (deepspeed_trainer:228) INFO: 27epoch:train:6501-6600batch: iter_time=1.095e-04, loss_ctc=73.998, loss_att=53.509, acc=0.727, loss=59.674, grad_norm=4.729, loss_scale=1.000, learning_rate=7.780e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 20:09:10,152 (deepspeed_trainer:228) INFO: 27epoch:train:6601-6700batch: iter_time=1.162e-04, loss_ctc=76.149, loss_att=64.168, acc=0.703, loss=67.777, grad_norm=5.051, loss_scale=1.000, learning_rate=7.779e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 20:09:43,503 (deepspeed_trainer:228) INFO: 27epoch:train:6701-6800batch: iter_time=1.109e-04, loss_ctc=77.700, loss_att=57.878, acc=0.715, loss=63.814, grad_norm=4.855, loss_scale=1.000, learning_rate=7.778e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 20:10:16,531 (deepspeed_trainer:228) INFO: 27epoch:train:6801-6900batch: iter_time=1.089e-04, loss_ctc=68.806, loss_att=50.800, acc=0.719, loss=56.213, grad_norm=4.502, loss_scale=1.000, learning_rate=7.777e-05, step_time=0.331 [2024-12-07 20:10:49,694] [INFO] [logging.py:129:log_dist] [Rank 0] step=397000, skipped=0, lr=[np.float64(7.775168694542511e-05)], mom=[[0.9, 0.98]] [2024-12-07 20:10:49,695] [INFO] [timer.py:264:stop] epoch=0/micro_step=7000/global_step=7000, RunningAvgSamplesPerSec=49.52756819574419, CurrSamplesPerSec=52.80731315431144, MemAllocated=1.77GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 20:10:49,697 (deepspeed_trainer:228) INFO: 27epoch:train:6901-7000batch: iter_time=1.092e-04, loss_ctc=75.430, loss_att=60.554, acc=0.720, loss=65.010, grad_norm=4.681, loss_scale=1.000, learning_rate=7.776e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 20:11:22,645 (deepspeed_trainer:228) INFO: 27epoch:train:7001-7100batch: iter_time=1.105e-04, loss_ctc=70.935, loss_att=53.379, acc=0.713, loss=58.639, grad_norm=4.982, loss_scale=1.000, learning_rate=7.775e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 20:11:55,527 (deepspeed_trainer:228) INFO: 27epoch:train:7101-7200batch: iter_time=1.071e-04, loss_ctc=76.927, loss_att=59.232, acc=0.715, loss=64.517, grad_norm=5.078, loss_scale=1.000, learning_rate=7.774e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 20:12:28,601 (deepspeed_trainer:228) INFO: 27epoch:train:7201-7300batch: iter_time=1.104e-04, loss_ctc=70.581, loss_att=60.235, acc=0.717, loss=63.323, grad_norm=4.770, loss_scale=1.000, learning_rate=7.773e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 20:13:01,448 (deepspeed_trainer:228) INFO: 27epoch:train:7301-7400batch: iter_time=1.097e-04, loss_ctc=76.732, loss_att=59.040, acc=0.710, loss=64.355, grad_norm=5.345, loss_scale=1.000, learning_rate=7.772e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 20:13:34,159 (deepspeed_trainer:228) INFO: 27epoch:train:7401-7500batch: iter_time=1.048e-04, loss_ctc=75.922, loss_att=52.991, acc=0.720, loss=59.860, grad_norm=5.035, loss_scale=1.000, learning_rate=7.771e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-07 20:13:38,914 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-07 20:14:05,282 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 20:14:21,509 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 20:14:21,509 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 20:14:21,512 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 20:14:46,762] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:14:47,100] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:14:47,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:14:46,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:14:48,228] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:14:50,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:14:50,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:14:48,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:14:50,519] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:14:50,871] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:14:49,487] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:14:51,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:14:49,771] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:14:50,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:14:50,990] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:14:53,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:33,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:35,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:36,572] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:35,876] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:38,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:37,100] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:38,803] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:37,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:39,078] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:39,250] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:39,540] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:39,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:39,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:39,714] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:40,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:15:47,664] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:20,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:22,318] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:23,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:23,185] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:26,459] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:26,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:25,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:27,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:26,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:27,736] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:28,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:28,424] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:29,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:29,454] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:30,179] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:16:39,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:07,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:09,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:11,444] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:10,485] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:12,917] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:15,074] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:16,059] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:16,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:16,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:16,074] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:16,928] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:17,211] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:17,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:19,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:22,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:17:29,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 20:18:28,468 (deepspeed_trainer:228) INFO: 27epoch:train:7501-7600batch: iter_time=2.514, loss_ctc=68.813, loss_att=56.862, acc=0.713, loss=60.438, grad_norm=4.914, loss_scale=1.000, learning_rate=7.770e-05, step_time=0.429 [cnode7-012:0/16] 2024-12-07 20:19:02,011 (deepspeed_trainer:228) INFO: 27epoch:train:7601-7700batch: iter_time=1.155e-04, loss_ctc=75.419, loss_att=58.517, acc=0.713, loss=63.591, grad_norm=5.009, loss_scale=1.000, learning_rate=7.769e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-07 20:19:35,427 (deepspeed_trainer:228) INFO: 27epoch:train:7701-7800batch: iter_time=1.085e-04, loss_ctc=66.820, loss_att=48.664, acc=0.737, loss=54.112, grad_norm=4.817, loss_scale=1.000, learning_rate=7.768e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 20:20:08,339 (deepspeed_trainer:228) INFO: 27epoch:train:7801-7900batch: iter_time=1.130e-04, loss_ctc=70.215, loss_att=53.880, acc=0.712, loss=58.793, grad_norm=5.373, loss_scale=1.000, learning_rate=7.767e-05, step_time=0.329 [2024-12-07 20:20:42,035] [INFO] [logging.py:129:log_dist] [Rank 0] step=398000, skipped=0, lr=[np.float64(7.765394775975247e-05)], mom=[[0.9, 0.98]] [2024-12-07 20:20:42,036] [INFO] [timer.py:264:stop] epoch=0/micro_step=8000/global_step=8000, RunningAvgSamplesPerSec=49.376109449819296, CurrSamplesPerSec=50.7505739041557, MemAllocated=1.77GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 20:20:42,039 (deepspeed_trainer:228) INFO: 27epoch:train:7901-8000batch: iter_time=1.159e-04, loss_ctc=75.236, loss_att=55.132, acc=0.723, loss=61.181, grad_norm=5.107, loss_scale=1.000, learning_rate=7.766e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-07 20:21:14,929 (deepspeed_trainer:228) INFO: 27epoch:train:8001-8100batch: iter_time=1.123e-04, loss_ctc=70.195, loss_att=48.656, acc=0.725, loss=55.107, grad_norm=4.655, loss_scale=1.000, learning_rate=7.765e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 20:21:48,086 (deepspeed_trainer:228) INFO: 27epoch:train:8101-8200batch: iter_time=1.133e-04, loss_ctc=75.182, loss_att=57.236, acc=0.716, loss=62.631, grad_norm=6.017, loss_scale=1.000, learning_rate=7.764e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 20:22:21,390 (deepspeed_trainer:228) INFO: 27epoch:train:8201-8300batch: iter_time=1.126e-04, loss_ctc=75.307, loss_att=59.768, acc=0.708, loss=64.433, grad_norm=5.263, loss_scale=1.000, learning_rate=7.763e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 20:22:54,832 (deepspeed_trainer:228) INFO: 27epoch:train:8301-8400batch: iter_time=1.154e-04, loss_ctc=80.231, loss_att=59.181, acc=0.727, loss=65.486, grad_norm=5.166, loss_scale=1.000, learning_rate=7.762e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 20:23:28,004 (deepspeed_trainer:228) INFO: 27epoch:train:8401-8500batch: iter_time=1.109e-04, loss_ctc=72.639, loss_att=55.410, acc=0.727, loss=60.587, grad_norm=4.842, loss_scale=1.000, learning_rate=7.761e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 20:24:01,378 (deepspeed_trainer:228) INFO: 27epoch:train:8501-8600batch: iter_time=1.146e-04, loss_ctc=73.272, loss_att=58.975, acc=0.709, loss=63.273, grad_norm=5.032, loss_scale=1.000, learning_rate=7.760e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 20:24:34,670 (deepspeed_trainer:228) INFO: 27epoch:train:8601-8700batch: iter_time=1.161e-04, loss_ctc=78.969, loss_att=60.666, acc=0.716, loss=66.128, grad_norm=5.042, loss_scale=1.000, learning_rate=7.759e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 20:25:07,888 (deepspeed_trainer:228) INFO: 27epoch:train:8701-8800batch: iter_time=1.121e-04, loss_ctc=71.043, loss_att=55.941, acc=0.711, loss=60.476, grad_norm=4.807, loss_scale=1.000, learning_rate=7.758e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 20:25:41,215 (deepspeed_trainer:228) INFO: 27epoch:train:8801-8900batch: iter_time=1.133e-04, loss_ctc=70.325, loss_att=51.897, acc=0.728, loss=57.408, grad_norm=4.409, loss_scale=1.000, learning_rate=7.757e-05, step_time=0.333 [2024-12-07 20:26:14,036] [INFO] [logging.py:129:log_dist] [Rank 0] step=399000, skipped=0, lr=[np.float64(7.7556576244317e-05)], mom=[[0.9, 0.98]] [2024-12-07 20:26:14,037] [INFO] [timer.py:264:stop] epoch=0/micro_step=9000/global_step=9000, RunningAvgSamplesPerSec=49.412652569247726, CurrSamplesPerSec=53.10479944031686, MemAllocated=1.77GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 20:26:14,039 (deepspeed_trainer:228) INFO: 27epoch:train:8901-9000batch: iter_time=1.139e-04, loss_ctc=72.233, loss_att=51.542, acc=0.721, loss=57.737, grad_norm=5.063, loss_scale=1.000, learning_rate=7.756e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 20:26:47,054 (deepspeed_trainer:228) INFO: 27epoch:train:9001-9100batch: iter_time=1.100e-04, loss_ctc=77.958, loss_att=62.947, acc=0.710, loss=67.499, grad_norm=4.930, loss_scale=1.000, learning_rate=7.755e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 20:27:20,127 (deepspeed_trainer:228) INFO: 27epoch:train:9101-9200batch: iter_time=1.123e-04, loss_ctc=72.461, loss_att=62.320, acc=0.707, loss=65.313, grad_norm=4.988, loss_scale=1.000, learning_rate=7.754e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 20:27:53,243 (deepspeed_trainer:228) INFO: 27epoch:train:9201-9300batch: iter_time=1.164e-04, loss_ctc=75.339, loss_att=56.323, acc=0.717, loss=62.044, grad_norm=4.798, loss_scale=1.000, learning_rate=7.753e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 20:28:22,313 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-07 20:28:49,553 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 20:29:05,987 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 20:29:05,987 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 20:29:05,989 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 20:29:30,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:29:31,098] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:29:31,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:29:34,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:29:34,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:29:34,413] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:29:34,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:29:34,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:29:34,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:29:35,174] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:29:35,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:29:34,237] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:29:35,002] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:29:35,068] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:29:35,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:29:35,652] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:19,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:20,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:21,276] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:23,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:23,581] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:23,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:24,336] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:24,410] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:24,924] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:24,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:24,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:25,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:25,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:25,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:27,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:30:29,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:07,658] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:09,846] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:11,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:11,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:12,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:12,951] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:13,230] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:13,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:13,908] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:13,113] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:13,603] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:14,340] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:15,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:15,491] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:18,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:20,200] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:56,112] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:31:59,164] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:32:01,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:32:02,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:32:02,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:32:03,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:32:03,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:32:03,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:32:03,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:32:02,944] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:32:03,974] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:32:04,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:32:04,257] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:32:05,119] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:32:06,751] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:32:17,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 20:32:51,222 (deepspeed_trainer:228) INFO: 27epoch:train:9301-9400batch: iter_time=2.566, loss_ctc=75.705, loss_att=56.460, acc=0.714, loss=62.210, grad_norm=6.109, loss_scale=1.000, learning_rate=7.752e-05, step_time=0.413 [cnode7-012:0/16] 2024-12-07 20:33:25,534 (deepspeed_trainer:228) INFO: 27epoch:train:9401-9500batch: iter_time=1.109e-04, loss_ctc=64.787, loss_att=49.888, acc=0.720, loss=54.364, grad_norm=4.574, loss_scale=1.000, learning_rate=7.751e-05, step_time=0.343 [cnode7-012:0/16] 2024-12-07 20:33:59,851 (deepspeed_trainer:228) INFO: 27epoch:train:9501-9600batch: iter_time=1.158e-04, loss_ctc=73.989, loss_att=58.617, acc=0.718, loss=63.230, grad_norm=5.549, loss_scale=1.000, learning_rate=7.750e-05, step_time=0.343 [cnode7-012:0/16] 2024-12-07 20:34:33,307 (deepspeed_trainer:228) INFO: 27epoch:train:9601-9700batch: iter_time=1.151e-04, loss_ctc=65.496, loss_att=48.153, acc=0.732, loss=53.361, grad_norm=4.336, loss_scale=1.000, learning_rate=7.749e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 20:35:07,178 (deepspeed_trainer:228) INFO: 27epoch:train:9701-9800batch: iter_time=1.168e-04, loss_ctc=73.998, loss_att=54.608, acc=0.723, loss=60.449, grad_norm=5.290, loss_scale=1.000, learning_rate=7.748e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-07 20:35:41,021 (deepspeed_trainer:228) INFO: 27epoch:train:9801-9900batch: iter_time=1.182e-04, loss_ctc=77.363, loss_att=55.543, acc=0.723, loss=62.111, grad_norm=4.475, loss_scale=1.000, learning_rate=7.747e-05, step_time=0.338 [2024-12-07 20:36:15,679] [INFO] [logging.py:129:log_dist] [Rank 0] step=400000, skipped=0, lr=[np.float64(7.745957009974624e-05)], mom=[[0.9, 0.98]] [2024-12-07 20:36:15,683] [INFO] [timer.py:264:stop] epoch=0/micro_step=10000/global_step=10000, RunningAvgSamplesPerSec=49.24374333007177, CurrSamplesPerSec=48.48933706143378, MemAllocated=1.77GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 20:36:15,686 (deepspeed_trainer:228) INFO: 27epoch:train:9901-10000batch: iter_time=1.126e-04, loss_ctc=67.422, loss_att=49.418, acc=0.722, loss=54.849, grad_norm=4.774, loss_scale=1.000, learning_rate=7.746e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-07 20:36:50,118 (deepspeed_trainer:228) INFO: 27epoch:train:10001-10100batch: iter_time=1.115e-04, loss_ctc=75.740, loss_att=57.595, acc=0.716, loss=63.046, grad_norm=5.746, loss_scale=1.000, learning_rate=7.745e-05, step_time=0.344 [cnode7-012:0/16] 2024-12-07 20:37:23,673 (deepspeed_trainer:228) INFO: 27epoch:train:10101-10200batch: iter_time=1.164e-04, loss_ctc=77.412, loss_att=61.695, acc=0.704, loss=66.405, grad_norm=4.958, loss_scale=1.000, learning_rate=7.745e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-07 20:37:57,080 (deepspeed_trainer:228) INFO: 27epoch:train:10201-10300batch: iter_time=1.156e-04, loss_ctc=77.865, loss_att=56.551, acc=0.730, loss=62.926, grad_norm=4.866, loss_scale=1.000, learning_rate=7.744e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 20:38:30,360 (deepspeed_trainer:228) INFO: 27epoch:train:10301-10400batch: iter_time=1.207e-04, loss_ctc=73.956, loss_att=57.480, acc=0.719, loss=62.429, grad_norm=5.248, loss_scale=1.000, learning_rate=7.743e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 20:39:03,557 (deepspeed_trainer:228) INFO: 27epoch:train:10401-10500batch: iter_time=1.117e-04, loss_ctc=72.523, loss_att=56.049, acc=0.715, loss=60.981, grad_norm=5.029, loss_scale=1.000, learning_rate=7.742e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 20:39:37,051 (deepspeed_trainer:228) INFO: 27epoch:train:10501-10600batch: iter_time=1.113e-04, loss_ctc=74.398, loss_att=56.780, acc=0.723, loss=62.084, grad_norm=4.823, loss_scale=1.000, learning_rate=7.741e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-07 20:40:10,500 (deepspeed_trainer:228) INFO: 27epoch:train:10601-10700batch: iter_time=1.123e-04, loss_ctc=72.987, loss_att=58.505, acc=0.709, loss=62.854, grad_norm=4.866, loss_scale=1.000, learning_rate=7.740e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 20:40:43,881 (deepspeed_trainer:228) INFO: 27epoch:train:10701-10800batch: iter_time=1.104e-04, loss_ctc=71.946, loss_att=55.938, acc=0.717, loss=60.754, grad_norm=4.696, loss_scale=1.000, learning_rate=7.739e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 20:41:16,888 (deepspeed_trainer:228) INFO: 27epoch:train:10801-10900batch: iter_time=1.096e-04, loss_ctc=71.399, loss_att=51.708, acc=0.725, loss=57.620, grad_norm=5.056, loss_scale=1.000, learning_rate=7.738e-05, step_time=0.330 [2024-12-07 20:41:49,851] [INFO] [logging.py:129:log_dist] [Rank 0] step=401000, skipped=0, lr=[np.float64(7.736292704674957e-05)], mom=[[0.9, 0.98]] [2024-12-07 20:41:49,852] [INFO] [timer.py:264:stop] epoch=0/micro_step=11000/global_step=11000, RunningAvgSamplesPerSec=49.255386867885804, CurrSamplesPerSec=51.41621412803472, MemAllocated=1.77GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 20:41:49,855 (deepspeed_trainer:228) INFO: 27epoch:train:10901-11000batch: iter_time=1.102e-04, loss_ctc=77.456, loss_att=64.293, acc=0.705, loss=68.244, grad_norm=5.398, loss_scale=1.000, learning_rate=7.737e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 20:42:22,956 (deepspeed_trainer:228) INFO: 27epoch:train:11001-11100batch: iter_time=1.116e-04, loss_ctc=70.293, loss_att=57.292, acc=0.715, loss=61.189, grad_norm=5.143, loss_scale=1.000, learning_rate=7.736e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 20:42:56,031 (deepspeed_trainer:228) INFO: 27epoch:train:11101-11200batch: iter_time=1.124e-04, loss_ctc=72.217, loss_att=53.193, acc=0.722, loss=58.900, grad_norm=5.137, loss_scale=1.000, learning_rate=7.735e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 20:43:17,210 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-07 20:43:43,963 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 20:44:00,873 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 20:44:00,873 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 20:44:00,875 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 20:44:25,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:44:26,128] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:44:27,392] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:44:27,386] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:44:27,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:44:27,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:44:27,904] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:44:28,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:44:28,157] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:44:28,363] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:44:28,559] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:44:28,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:44:29,074] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:44:29,565] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:44:30,064] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:44:30,737] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:13,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:14,498] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:15,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:16,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:16,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:16,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:17,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:17,359] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:17,943] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:17,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:18,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:18,927] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:19,190] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:19,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:19,951] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:45:21,219] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:01,420] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:02,626] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:04,466] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:04,706] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:05,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:05,916] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:06,350] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:06,606] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:06,756] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:08,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:08,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:08,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:09,002] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:09,211] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:10,558] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:11,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:48,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:50,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:52,752] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:53,553] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:53,641] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:54,598] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:55,458] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:55,472] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:55,730] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:57,622] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:57,953] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:58,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:58,620] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:46:58,980] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:47:00,417] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:47:01,812] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 20:47:46,295 (deepspeed_trainer:228) INFO: 27epoch:train:11201-11300batch: iter_time=2.539, loss_ctc=75.538, loss_att=55.350, acc=0.717, loss=61.399, grad_norm=5.960, loss_scale=1.000, learning_rate=7.734e-05, step_time=0.359 [cnode7-012:0/16] 2024-12-07 20:48:19,802 (deepspeed_trainer:228) INFO: 27epoch:train:11301-11400batch: iter_time=1.081e-04, loss_ctc=67.364, loss_att=54.573, acc=0.712, loss=58.428, grad_norm=4.612, loss_scale=1.000, learning_rate=7.733e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-07 20:48:53,410 (deepspeed_trainer:228) INFO: 27epoch:train:11401-11500batch: iter_time=1.123e-04, loss_ctc=72.472, loss_att=55.681, acc=0.723, loss=60.743, grad_norm=4.960, loss_scale=1.000, learning_rate=7.732e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-07 20:49:26,347 (deepspeed_trainer:228) INFO: 27epoch:train:11501-11600batch: iter_time=1.104e-04, loss_ctc=70.026, loss_att=53.212, acc=0.721, loss=58.255, grad_norm=5.068, loss_scale=1.000, learning_rate=7.731e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 20:49:59,217 (deepspeed_trainer:228) INFO: 27epoch:train:11601-11700batch: iter_time=1.085e-04, loss_ctc=70.875, loss_att=52.208, acc=0.718, loss=57.802, grad_norm=5.078, loss_scale=1.000, learning_rate=7.730e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 20:50:32,340 (deepspeed_trainer:228) INFO: 27epoch:train:11701-11800batch: iter_time=1.102e-04, loss_ctc=79.225, loss_att=57.697, acc=0.719, loss=64.154, grad_norm=5.083, loss_scale=1.000, learning_rate=7.729e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 20:51:05,328 (deepspeed_trainer:228) INFO: 27epoch:train:11801-11900batch: iter_time=1.127e-04, loss_ctc=59.937, loss_att=43.758, acc=0.717, loss=48.631, grad_norm=4.985, loss_scale=1.000, learning_rate=7.728e-05, step_time=0.330 [2024-12-07 20:51:38,775] [INFO] [logging.py:129:log_dist] [Rank 0] step=402000, skipped=0, lr=[np.float64(7.726664482589327e-05)], mom=[[0.9, 0.98]] [2024-12-07 20:51:38,775] [INFO] [timer.py:264:stop] epoch=0/micro_step=12000/global_step=12000, RunningAvgSamplesPerSec=49.254251908737906, CurrSamplesPerSec=47.641323603125784, MemAllocated=1.77GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 20:51:38,778 (deepspeed_trainer:228) INFO: 27epoch:train:11901-12000batch: iter_time=1.115e-04, loss_ctc=83.476, loss_att=66.211, acc=0.703, loss=71.389, grad_norm=5.857, loss_scale=1.000, learning_rate=7.727e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 20:52:11,998 (deepspeed_trainer:228) INFO: 27epoch:train:12001-12100batch: iter_time=1.131e-04, loss_ctc=76.879, loss_att=56.937, acc=0.716, loss=62.932, grad_norm=5.234, loss_scale=1.000, learning_rate=7.726e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 20:52:45,154 (deepspeed_trainer:228) INFO: 27epoch:train:12101-12200batch: iter_time=1.142e-04, loss_ctc=75.541, loss_att=55.905, acc=0.717, loss=61.798, grad_norm=4.959, loss_scale=1.000, learning_rate=7.725e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 20:53:18,323 (deepspeed_trainer:228) INFO: 27epoch:train:12201-12300batch: iter_time=1.102e-04, loss_ctc=76.721, loss_att=61.166, acc=0.708, loss=65.827, grad_norm=5.369, loss_scale=1.000, learning_rate=7.724e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 20:53:51,222 (deepspeed_trainer:228) INFO: 27epoch:train:12301-12400batch: iter_time=1.148e-04, loss_ctc=69.099, loss_att=49.202, acc=0.722, loss=55.162, grad_norm=4.841, loss_scale=1.000, learning_rate=7.723e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 20:54:24,686 (deepspeed_trainer:228) INFO: 27epoch:train:12401-12500batch: iter_time=1.126e-04, loss_ctc=73.812, loss_att=56.797, acc=0.717, loss=61.883, grad_norm=5.128, loss_scale=1.000, learning_rate=7.722e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 20:54:58,102 (deepspeed_trainer:228) INFO: 27epoch:train:12501-12600batch: iter_time=1.110e-04, loss_ctc=72.575, loss_att=58.816, acc=0.699, loss=62.920, grad_norm=4.766, loss_scale=1.000, learning_rate=7.721e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 20:55:31,674 (deepspeed_trainer:228) INFO: 27epoch:train:12601-12700batch: iter_time=1.106e-04, loss_ctc=74.142, loss_att=57.256, acc=0.711, loss=62.330, grad_norm=5.101, loss_scale=1.000, learning_rate=7.720e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-07 20:56:05,501 (deepspeed_trainer:228) INFO: 27epoch:train:12701-12800batch: iter_time=1.123e-04, loss_ctc=75.091, loss_att=56.361, acc=0.720, loss=61.989, grad_norm=5.306, loss_scale=1.000, learning_rate=7.719e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-07 20:56:38,412 (deepspeed_trainer:228) INFO: 27epoch:train:12801-12900batch: iter_time=1.119e-04, loss_ctc=69.233, loss_att=56.586, acc=0.704, loss=60.333, grad_norm=4.781, loss_scale=1.000, learning_rate=7.719e-05, step_time=0.329 [2024-12-07 20:57:11,356] [INFO] [logging.py:129:log_dist] [Rank 0] step=403000, skipped=0, lr=[np.float64(7.717072119737867e-05)], mom=[[0.9, 0.98]] [2024-12-07 20:57:11,357] [INFO] [timer.py:264:stop] epoch=0/micro_step=13000/global_step=13000, RunningAvgSamplesPerSec=49.28304908767614, CurrSamplesPerSec=52.778200166559394, MemAllocated=1.77GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 20:57:11,359 (deepspeed_trainer:228) INFO: 27epoch:train:12901-13000batch: iter_time=1.102e-04, loss_ctc=78.353, loss_att=63.302, acc=0.708, loss=67.789, grad_norm=5.300, loss_scale=1.000, learning_rate=7.718e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 20:57:44,057 (deepspeed_trainer:228) INFO: 27epoch:train:13001-13100batch: iter_time=1.105e-04, loss_ctc=70.792, loss_att=50.380, acc=0.720, loss=56.504, grad_norm=4.856, loss_scale=1.000, learning_rate=7.717e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-07 20:57:56,930 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-07 20:58:24,590 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 20:58:40,630 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 20:58:40,630 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 20:58:40,632 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 20:59:04,006] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:04,318] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:04,023] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:04,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:04,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:04,936] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:06,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:06,362] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:06,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:06,765] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:07,404] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:08,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:09,155] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:09,469] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:09,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:11,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:51,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:53,006] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:53,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:53,727] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:53,326] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:54,293] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:54,702] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:55,359] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:55,569] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:56,069] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:56,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:58,730] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:58,959] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 20:59:59,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:00,216] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:00,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:39,476] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:39,897] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:40,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:41,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:42,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:42,321] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:42,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:43,810] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:44,229] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:44,471] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:45,675] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:47,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:47,985] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:48,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:48,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:00:50,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:26,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:28,138] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:28,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:29,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:31,321] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:31,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:32,110] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:32,177] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:32,695] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:33,886] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:33,668] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:37,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:37,897] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:37,970] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:38,048] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:01:39,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 21:02:31,990 (deepspeed_trainer:228) INFO: 27epoch:train:13101-13200batch: iter_time=2.534, loss_ctc=73.073, loss_att=57.553, acc=0.716, loss=62.226, grad_norm=5.500, loss_scale=1.000, learning_rate=7.716e-05, step_time=0.353 [cnode7-012:0/16] 2024-12-07 21:03:06,605 (deepspeed_trainer:228) INFO: 27epoch:train:13201-13300batch: iter_time=1.096e-04, loss_ctc=69.386, loss_att=55.837, acc=0.714, loss=59.921, grad_norm=5.234, loss_scale=1.000, learning_rate=7.715e-05, step_time=0.345 [cnode7-012:0/16] 2024-12-07 21:03:40,256 (deepspeed_trainer:228) INFO: 27epoch:train:13301-13400batch: iter_time=1.099e-04, loss_ctc=69.109, loss_att=50.527, acc=0.735, loss=56.107, grad_norm=5.386, loss_scale=1.000, learning_rate=7.714e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-07 21:04:13,335 (deepspeed_trainer:228) INFO: 27epoch:train:13401-13500batch: iter_time=1.118e-04, loss_ctc=72.952, loss_att=56.735, acc=0.713, loss=61.599, grad_norm=5.365, loss_scale=1.000, learning_rate=7.713e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 21:04:46,341 (deepspeed_trainer:228) INFO: 27epoch:train:13501-13600batch: iter_time=1.111e-04, loss_ctc=70.334, loss_att=51.567, acc=0.730, loss=57.183, grad_norm=4.802, loss_scale=1.000, learning_rate=7.712e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 21:05:19,344 (deepspeed_trainer:228) INFO: 27epoch:train:13601-13700batch: iter_time=1.094e-04, loss_ctc=75.658, loss_att=54.442, acc=0.716, loss=60.797, grad_norm=5.220, loss_scale=1.000, learning_rate=7.711e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 21:05:52,298 (deepspeed_trainer:228) INFO: 27epoch:train:13701-13800batch: iter_time=1.129e-04, loss_ctc=68.247, loss_att=51.765, acc=0.722, loss=56.712, grad_norm=4.853, loss_scale=1.000, learning_rate=7.710e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 21:06:25,483 (deepspeed_trainer:228) INFO: 27epoch:train:13801-13900batch: iter_time=1.121e-04, loss_ctc=81.473, loss_att=64.875, acc=0.705, loss=69.863, grad_norm=5.921, loss_scale=1.000, learning_rate=7.709e-05, step_time=0.332 [2024-12-07 21:06:58,613] [INFO] [logging.py:129:log_dist] [Rank 0] step=404000, skipped=0, lr=[np.float64(7.707515394082323e-05)], mom=[[0.9, 0.98]] [2024-12-07 21:06:58,614] [INFO] [timer.py:264:stop] epoch=0/micro_step=14000/global_step=14000, RunningAvgSamplesPerSec=49.28333487481906, CurrSamplesPerSec=50.70413856495474, MemAllocated=1.77GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 21:06:58,618 (deepspeed_trainer:228) INFO: 27epoch:train:13901-14000batch: iter_time=1.127e-04, loss_ctc=75.779, loss_att=55.030, acc=0.729, loss=61.248, grad_norm=5.372, loss_scale=1.000, learning_rate=7.708e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 21:07:31,459 (deepspeed_trainer:228) INFO: 27epoch:train:14001-14100batch: iter_time=1.125e-04, loss_ctc=72.805, loss_att=52.540, acc=0.730, loss=58.638, grad_norm=4.827, loss_scale=1.000, learning_rate=7.707e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 21:08:04,317 (deepspeed_trainer:228) INFO: 27epoch:train:14101-14200batch: iter_time=1.119e-04, loss_ctc=75.903, loss_att=64.586, acc=0.701, loss=67.984, grad_norm=5.045, loss_scale=1.000, learning_rate=7.706e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 21:08:37,088 (deepspeed_trainer:228) INFO: 27epoch:train:14201-14300batch: iter_time=1.103e-04, loss_ctc=76.607, loss_att=56.966, acc=0.717, loss=62.880, grad_norm=4.856, loss_scale=1.000, learning_rate=7.705e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-07 21:09:09,689 (deepspeed_trainer:228) INFO: 27epoch:train:14301-14400batch: iter_time=1.106e-04, loss_ctc=67.772, loss_att=50.608, acc=0.720, loss=55.767, grad_norm=4.922, loss_scale=1.000, learning_rate=7.704e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-07 21:09:43,121 (deepspeed_trainer:228) INFO: 27epoch:train:14401-14500batch: iter_time=1.096e-04, loss_ctc=75.387, loss_att=60.807, acc=0.720, loss=65.177, grad_norm=4.538, loss_scale=1.000, learning_rate=7.703e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 21:10:15,845 (deepspeed_trainer:228) INFO: 27epoch:train:14501-14600batch: iter_time=1.119e-04, loss_ctc=70.901, loss_att=53.213, acc=0.717, loss=58.528, grad_norm=5.087, loss_scale=1.000, learning_rate=7.702e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-07 21:10:48,740 (deepspeed_trainer:228) INFO: 27epoch:train:14601-14700batch: iter_time=1.109e-04, loss_ctc=76.019, loss_att=58.870, acc=0.716, loss=64.030, grad_norm=5.114, loss_scale=1.000, learning_rate=7.701e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 21:11:21,611 (deepspeed_trainer:228) INFO: 27epoch:train:14701-14800batch: iter_time=1.096e-04, loss_ctc=69.838, loss_att=59.320, acc=0.718, loss=62.490, grad_norm=4.757, loss_scale=1.000, learning_rate=7.700e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 21:11:54,527 (deepspeed_trainer:228) INFO: 27epoch:train:14801-14900batch: iter_time=1.104e-04, loss_ctc=74.650, loss_att=57.912, acc=0.714, loss=62.927, grad_norm=5.054, loss_scale=1.000, learning_rate=7.699e-05, step_time=0.329 [2024-12-07 21:12:27,159] [INFO] [logging.py:129:log_dist] [Rank 0] step=405000, skipped=0, lr=[np.float64(7.697994085504475e-05)], mom=[[0.9, 0.98]] [2024-12-07 21:12:27,159] [INFO] [timer.py:264:stop] epoch=0/micro_step=15000/global_step=15000, RunningAvgSamplesPerSec=49.34497336118139, CurrSamplesPerSec=54.112799000832716, MemAllocated=1.77GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 21:12:27,162 (deepspeed_trainer:228) INFO: 27epoch:train:14901-15000batch: iter_time=1.097e-04, loss_ctc=74.737, loss_att=52.045, acc=0.722, loss=58.862, grad_norm=5.001, loss_scale=1.000, learning_rate=7.698e-05, step_time=0.326 [2024-12-07 21:12:42,592] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:42,974] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:43,675] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:43,796] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:43,972] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:44,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:44,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:44,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:44,170] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:44,401] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:44,480] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:44,508] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:44,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:44,799] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:44,853] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:44,961] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:57,573] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:59,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:59,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:59,271] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:59,371] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:12:59,538] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:00,049] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:00,260] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:00,376] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:00,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:00,502] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:00,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:00,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:01,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:01,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:01,290] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:12,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:14,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:14,777] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:14,948] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:15,293] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:15,382] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:15,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:15,664] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:15,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:16,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:16,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:16,298] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:16,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:16,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:17,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:17,213] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:27,451] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:30,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:30,418] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:30,783] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:30,900] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:31,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:31,366] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:31,786] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:31,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:32,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:32,113] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:32,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:32,573] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:32,711] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:32,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:32,684] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:13:45,293] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 27 is about to be saved! [2024-12-07 21:13:45,323] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/mp_rank_00_model_states.pt [2024-12-07 21:13:45,324] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/mp_rank_00_model_states.pt... [2024-12-07 21:13:47,188] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/mp_rank_00_model_states.pt. [2024-12-07 21:13:47,343] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,344] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,345] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,417] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,345] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,345] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,417] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,418] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,418] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,346] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,418] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,418] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,346] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,419] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,348] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,422] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-07 21:13:47,940] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-07 21:13:47,940] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-07 21:13:47,940] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [2024-12-07 21:13:48,022] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-07 21:13:48,022] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-07 21:13:48,022] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [2024-12-07 21:13:48,036] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-07 21:13:48,036] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-07 21:13:48,037] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [2024-12-07 21:13:48,056] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-07 21:13:48,056] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-07 21:13:48,056] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [2024-12-07 21:13:48,067] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-07 21:13:48,067] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-07 21:13:48,067] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [2024-12-07 21:13:48,080] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-07 21:13:48,080] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-07 21:13:48,080] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [2024-12-07 21:13:48,139] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-07 21:13:48,139] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-07 21:13:48,139] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [2024-12-07 21:13:48,146] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-07 21:13:48,146] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-07 21:13:48,146] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [2024-12-07 21:13:48,084] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-07 21:13:48,084] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-07 21:13:48,085] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [2024-12-07 21:13:48,089] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-07 21:13:48,089] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-07 21:13:48,089] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [2024-12-07 21:13:48,090] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-07 21:13:48,090] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-07 21:13:48,090] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [2024-12-07 21:13:48,094] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-07 21:13:48,095] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-07 21:13:48,095] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [2024-12-07 21:13:48,096] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-07 21:13:48,096] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-07 21:13:48,096] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [2024-12-07 21:13:48,111] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-07 21:13:48,111] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-07 21:13:48,111] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [2024-12-07 21:13:48,133] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-07 21:13:48,149] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-07 21:13:48,149] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [2024-12-07 21:13:48,240] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-07 21:13:48,240] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_27/27/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-07 21:13:48,240] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 27 is ready now! [cnode7-012:0/16] 2024-12-07 21:13:48,285 (deepspeed_trainer:158) INFO: 27epoch results: [train] iter_time=0.135, loss_ctc=73.745, loss_att=56.243, acc=0.715, loss=61.494, grad_norm=5.090, loss_scale=1.000, learning_rate=7.771e-05, step_time=0.335, time=1 hour, 57 minutes and 41.75 seconds, total_count=405027, gpu_max_cached_mem_GB=37.916, [valid] loss_ctc=4.750, cer_ctc=0.091, loss_att=8.438, acc=0.771, cer=0.464, wer=1.000, loss=7.312, time=1 minute and 8.19 seconds, total_count=27, gpu_max_cached_mem_GB=37.916 [cnode7-012:0/16] 2024-12-07 21:13:51,962 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-07 21:14:18,879 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 21:14:35,114 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 21:14:35,114 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 21:14:35,117 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 21:14:53,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:14:53,351] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:14:54,082] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:14:54,482] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:14:54,796] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:14:55,272] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:14:55,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:14:55,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:14:55,502] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:14:55,844] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:14:56,730] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:14:57,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:14:57,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:14:57,349] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:14:57,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:01,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:39,728] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:41,246] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:42,599] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:42,605] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:42,676] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:42,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:43,111] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:43,472] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:43,594] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:43,886] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:44,643] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:45,354] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:46,397] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:46,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:46,780] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:15:49,462] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:27,812] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:28,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:29,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:30,345] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:30,808] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:30,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:31,157] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:31,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:32,425] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:32,526] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:33,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:34,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:35,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:35,716] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:36,279] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:16:38,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:14,531] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:16,536] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:16,997] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:17,648] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:18,400] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:19,283] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:19,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:20,128] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:20,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:21,092] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:22,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:22,916] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:24,391] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:24,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:24,986] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:17:26,731] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) /mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py:279: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. with autocast(False): [cnode7-012:0/16] 2024-12-07 21:18:27,817 (deepspeed_trainer:228) INFO: 28epoch:train:1-100batch: iter_time=2.419, loss_ctc=72.562, loss_att=54.167, acc=0.716, loss=59.681, grad_norm=5.247, loss_scale=1.000, learning_rate=7.698e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-07 21:19:01,948 (deepspeed_trainer:228) INFO: 28epoch:train:101-200batch: iter_time=1.176e-04, loss_ctc=79.637, loss_att=63.444, acc=0.712, loss=68.322, grad_norm=5.102, loss_scale=1.000, learning_rate=7.697e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-07 21:19:35,440 (deepspeed_trainer:228) INFO: 28epoch:train:201-300batch: iter_time=1.109e-04, loss_ctc=80.717, loss_att=56.968, acc=0.701, loss=64.106, grad_norm=5.753, loss_scale=1.000, learning_rate=7.696e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-07 21:20:08,871 (deepspeed_trainer:228) INFO: 28epoch:train:301-400batch: iter_time=1.173e-04, loss_ctc=70.754, loss_att=54.947, acc=0.710, loss=59.696, grad_norm=4.735, loss_scale=1.000, learning_rate=7.695e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 21:20:42,332 (deepspeed_trainer:228) INFO: 28epoch:train:401-500batch: iter_time=1.125e-04, loss_ctc=79.341, loss_att=68.372, acc=0.705, loss=71.675, grad_norm=5.321, loss_scale=1.000, learning_rate=7.694e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 21:21:15,867 (deepspeed_trainer:228) INFO: 28epoch:train:501-600batch: iter_time=1.177e-04, loss_ctc=82.851, loss_att=62.771, acc=0.710, loss=68.813, grad_norm=5.664, loss_scale=1.000, learning_rate=7.693e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 21:21:49,282 (deepspeed_trainer:228) INFO: 28epoch:train:601-700batch: iter_time=1.120e-04, loss_ctc=66.672, loss_att=52.373, acc=0.711, loss=56.673, grad_norm=4.967, loss_scale=1.000, learning_rate=7.692e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 21:22:22,717 (deepspeed_trainer:228) INFO: 28epoch:train:701-800batch: iter_time=1.138e-04, loss_ctc=86.038, loss_att=63.430, acc=0.705, loss=70.202, grad_norm=5.992, loss_scale=1.000, learning_rate=7.691e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 21:22:56,185 (deepspeed_trainer:228) INFO: 28epoch:train:801-900batch: iter_time=1.148e-04, loss_ctc=72.184, loss_att=53.861, acc=0.711, loss=59.369, grad_norm=5.658, loss_scale=1.000, learning_rate=7.690e-05, step_time=0.334 [2024-12-07 21:23:29,437] [INFO] [logging.py:129:log_dist] [Rank 0] step=406000, skipped=0, lr=[np.float64(7.688507975784844e-05)], mom=[[0.9, 0.98]] [2024-12-07 21:23:29,437] [INFO] [timer.py:264:stop] epoch=0/micro_step=16000/global_step=16000, RunningAvgSamplesPerSec=49.33499219176047, CurrSamplesPerSec=52.27564077322992, MemAllocated=2.04GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 21:23:29,440 (deepspeed_trainer:228) INFO: 28epoch:train:901-1000batch: iter_time=1.190e-04, loss_ctc=73.385, loss_att=58.257, acc=0.708, loss=62.789, grad_norm=6.249, loss_scale=1.000, learning_rate=7.689e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 21:24:02,698 (deepspeed_trainer:228) INFO: 28epoch:train:1001-1100batch: iter_time=1.122e-04, loss_ctc=66.231, loss_att=52.452, acc=0.712, loss=56.573, grad_norm=5.366, loss_scale=1.000, learning_rate=7.688e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 21:24:36,240 (deepspeed_trainer:228) INFO: 28epoch:train:1101-1200batch: iter_time=1.106e-04, loss_ctc=76.789, loss_att=53.439, acc=0.716, loss=60.441, grad_norm=5.064, loss_scale=1.000, learning_rate=7.687e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-07 21:25:09,720 (deepspeed_trainer:228) INFO: 28epoch:train:1201-1300batch: iter_time=1.116e-04, loss_ctc=73.668, loss_att=56.048, acc=0.717, loss=61.327, grad_norm=5.024, loss_scale=1.000, learning_rate=7.686e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 21:25:43,038 (deepspeed_trainer:228) INFO: 28epoch:train:1301-1400batch: iter_time=1.121e-04, loss_ctc=69.394, loss_att=50.393, acc=0.718, loss=56.089, grad_norm=4.844, loss_scale=1.000, learning_rate=7.685e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 21:26:16,361 (deepspeed_trainer:228) INFO: 28epoch:train:1401-1500batch: iter_time=1.099e-04, loss_ctc=78.176, loss_att=58.881, acc=0.705, loss=64.676, grad_norm=5.843, loss_scale=1.000, learning_rate=7.684e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 21:26:49,722 (deepspeed_trainer:228) INFO: 28epoch:train:1501-1600batch: iter_time=1.154e-04, loss_ctc=70.711, loss_att=57.448, acc=0.718, loss=61.428, grad_norm=4.918, loss_scale=1.000, learning_rate=7.683e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 21:27:23,574 (deepspeed_trainer:228) INFO: 28epoch:train:1601-1700batch: iter_time=1.310e-04, loss_ctc=75.485, loss_att=58.098, acc=0.727, loss=63.307, grad_norm=5.246, loss_scale=1.000, learning_rate=7.682e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-07 21:27:57,014 (deepspeed_trainer:228) INFO: 28epoch:train:1701-1800batch: iter_time=1.113e-04, loss_ctc=69.911, loss_att=53.162, acc=0.721, loss=58.183, grad_norm=4.573, loss_scale=1.000, learning_rate=7.681e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 21:28:26,960 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-07 21:28:53,543 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 21:29:09,508 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 21:29:09,508 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 21:29:09,511 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 21:29:35,502] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:29:35,535] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:29:36,048] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:29:36,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:29:36,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:29:36,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:29:36,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:29:36,973] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:29:37,446] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:29:37,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:29:37,989] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:29:39,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:29:39,643] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:29:40,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:29:40,305] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:29:40,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:24,379] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:25,632] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:25,740] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:25,915] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:26,952] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:27,100] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:27,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:28,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:27,946] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:28,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:28,203] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:29,059] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:28,757] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:29,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:30,138] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:30:30,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:12,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:13,793] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:14,071] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:14,874] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:16,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:16,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:16,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:17,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:18,046] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:17,760] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:18,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:18,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:18,833] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:18,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:18,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:21,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:31:59,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:32:01,789] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:32:02,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:32:04,060] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:32:06,024] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:32:06,111] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:32:06,634] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:32:06,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:32:07,358] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:32:07,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:32:08,090] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:32:08,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:32:08,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:32:09,137] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:32:10,098] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:32:11,643] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 21:32:46,122 (deepspeed_trainer:228) INFO: 28epoch:train:1801-1900batch: iter_time=2.535, loss_ctc=73.112, loss_att=52.553, acc=0.713, loss=58.707, grad_norm=5.352, loss_scale=1.000, learning_rate=7.680e-05, step_time=0.356 [2024-12-07 21:33:19,981] [INFO] [logging.py:129:log_dist] [Rank 0] step=407000, skipped=0, lr=[np.float64(7.679056848581679e-05)], mom=[[0.9, 0.98]] [2024-12-07 21:33:19,982] [INFO] [timer.py:264:stop] epoch=0/micro_step=17000/global_step=17000, RunningAvgSamplesPerSec=49.31406617871564, CurrSamplesPerSec=47.48356498181457, MemAllocated=2.04GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 21:33:19,985 (deepspeed_trainer:228) INFO: 28epoch:train:1901-2000batch: iter_time=1.150e-04, loss_ctc=78.652, loss_att=60.576, acc=0.708, loss=65.984, grad_norm=5.370, loss_scale=1.000, learning_rate=7.680e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-07 21:33:53,935 (deepspeed_trainer:228) INFO: 28epoch:train:2001-2100batch: iter_time=1.128e-04, loss_ctc=71.457, loss_att=58.492, acc=0.706, loss=62.369, grad_norm=4.767, loss_scale=1.000, learning_rate=7.679e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-07 21:34:27,423 (deepspeed_trainer:228) INFO: 28epoch:train:2101-2200batch: iter_time=1.118e-04, loss_ctc=78.767, loss_att=56.947, acc=0.696, loss=63.478, grad_norm=5.469, loss_scale=1.000, learning_rate=7.678e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 21:35:00,933 (deepspeed_trainer:228) INFO: 28epoch:train:2201-2300batch: iter_time=1.087e-04, loss_ctc=73.919, loss_att=60.913, acc=0.707, loss=64.828, grad_norm=4.530, loss_scale=1.000, learning_rate=7.677e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-07 21:35:34,560 (deepspeed_trainer:228) INFO: 28epoch:train:2301-2400batch: iter_time=1.085e-04, loss_ctc=83.044, loss_att=69.999, acc=0.700, loss=73.903, grad_norm=5.258, loss_scale=1.000, learning_rate=7.676e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-07 21:36:07,597 (deepspeed_trainer:228) INFO: 28epoch:train:2401-2500batch: iter_time=1.102e-04, loss_ctc=76.104, loss_att=56.931, acc=0.714, loss=62.696, grad_norm=5.605, loss_scale=1.000, learning_rate=7.675e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 21:36:40,247 (deepspeed_trainer:228) INFO: 28epoch:train:2501-2600batch: iter_time=1.062e-04, loss_ctc=69.353, loss_att=55.660, acc=0.701, loss=59.736, grad_norm=5.243, loss_scale=1.000, learning_rate=7.674e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 21:37:13,305 (deepspeed_trainer:228) INFO: 28epoch:train:2601-2700batch: iter_time=1.100e-04, loss_ctc=82.910, loss_att=57.954, acc=0.707, loss=65.406, grad_norm=6.134, loss_scale=1.000, learning_rate=7.673e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 21:37:46,731 (deepspeed_trainer:228) INFO: 28epoch:train:2701-2800batch: iter_time=1.089e-04, loss_ctc=68.662, loss_att=51.129, acc=0.714, loss=56.371, grad_norm=5.014, loss_scale=1.000, learning_rate=7.672e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 21:38:19,633 (deepspeed_trainer:228) INFO: 28epoch:train:2801-2900batch: iter_time=1.086e-04, loss_ctc=72.899, loss_att=58.422, acc=0.702, loss=62.785, grad_norm=5.688, loss_scale=1.000, learning_rate=7.671e-05, step_time=0.329 [2024-12-07 21:38:52,425] [INFO] [logging.py:129:log_dist] [Rank 0] step=408000, skipped=0, lr=[np.float64(7.669640489410236e-05)], mom=[[0.9, 0.98]] [2024-12-07 21:38:52,426] [INFO] [timer.py:264:stop] epoch=0/micro_step=18000/global_step=18000, RunningAvgSamplesPerSec=49.32910085363819, CurrSamplesPerSec=52.6885713564299, MemAllocated=2.04GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 21:38:52,428 (deepspeed_trainer:228) INFO: 28epoch:train:2901-3000batch: iter_time=1.070e-04, loss_ctc=64.281, loss_att=51.257, acc=0.709, loss=55.176, grad_norm=5.154, loss_scale=1.000, learning_rate=7.670e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 21:39:25,322 (deepspeed_trainer:228) INFO: 28epoch:train:3001-3100batch: iter_time=1.108e-04, loss_ctc=80.127, loss_att=55.063, acc=0.712, loss=62.589, grad_norm=5.395, loss_scale=1.000, learning_rate=7.669e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 21:39:57,986 (deepspeed_trainer:228) INFO: 28epoch:train:3101-3200batch: iter_time=1.087e-04, loss_ctc=70.661, loss_att=51.548, acc=0.724, loss=57.273, grad_norm=4.985, loss_scale=1.000, learning_rate=7.668e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-07 21:40:30,507 (deepspeed_trainer:228) INFO: 28epoch:train:3201-3300batch: iter_time=1.078e-04, loss_ctc=71.947, loss_att=53.329, acc=0.706, loss=58.914, grad_norm=5.416, loss_scale=1.000, learning_rate=7.667e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-07 21:41:03,224 (deepspeed_trainer:228) INFO: 28epoch:train:3301-3400batch: iter_time=1.087e-04, loss_ctc=72.896, loss_att=54.381, acc=0.704, loss=59.930, grad_norm=5.232, loss_scale=1.000, learning_rate=7.666e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-07 21:41:36,283 (deepspeed_trainer:228) INFO: 28epoch:train:3401-3500batch: iter_time=1.080e-04, loss_ctc=73.799, loss_att=61.007, acc=0.713, loss=64.844, grad_norm=4.742, loss_scale=1.000, learning_rate=7.665e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 21:42:09,155 (deepspeed_trainer:228) INFO: 28epoch:train:3501-3600batch: iter_time=1.071e-04, loss_ctc=70.108, loss_att=53.059, acc=0.726, loss=58.175, grad_norm=4.848, loss_scale=1.000, learning_rate=7.664e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 21:42:42,045 (deepspeed_trainer:228) INFO: 28epoch:train:3601-3700batch: iter_time=1.121e-04, loss_ctc=72.743, loss_att=55.612, acc=0.706, loss=60.780, grad_norm=4.896, loss_scale=1.000, learning_rate=7.664e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 21:43:03,345 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-07 21:43:29,588 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 21:43:47,391 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 21:43:47,391 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 21:43:47,394 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 21:44:11,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:11,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:11,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:12,286] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:12,624] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:12,661] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:13,533] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:14,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:15,159] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:15,387] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:15,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:15,777] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:15,814] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:17,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:17,332] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:17,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:58,718] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:59,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:44:59,693] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:01,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:02,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:02,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:03,028] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:03,500] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:04,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:04,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:05,044] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:05,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:05,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:06,956] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:07,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:07,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:46,684] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:46,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:47,305] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:49,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:50,068] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:50,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:51,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:51,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:52,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:52,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:52,845] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:53,723] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:53,852] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:55,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:56,410] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:45:56,538] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:35,390] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:35,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:35,724] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:38,337] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:39,360] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:39,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:38,963] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:39,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:40,635] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:40,983] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:42,420] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:43,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:46,078] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:46,486] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:45,712] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:46:48,822] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 21:47:31,944 (deepspeed_trainer:228) INFO: 28epoch:train:3701-3800batch: iter_time=2.534, loss_ctc=76.711, loss_att=55.668, acc=0.707, loss=62.001, grad_norm=5.580, loss_scale=1.000, learning_rate=7.663e-05, step_time=0.357 [cnode7-012:0/16] 2024-12-07 21:48:05,760 (deepspeed_trainer:228) INFO: 28epoch:train:3801-3900batch: iter_time=1.080e-04, loss_ctc=75.109, loss_att=59.598, acc=0.720, loss=64.271, grad_norm=4.530, loss_scale=1.000, learning_rate=7.662e-05, step_time=0.337 [2024-12-07 21:48:39,349] [INFO] [logging.py:129:log_dist] [Rank 0] step=409000, skipped=0, lr=[np.float64(7.66025868562234e-05)], mom=[[0.9, 0.98]] [2024-12-07 21:48:39,350] [INFO] [timer.py:264:stop] epoch=0/micro_step=19000/global_step=19000, RunningAvgSamplesPerSec=49.34075855243375, CurrSamplesPerSec=50.956507486388794, MemAllocated=2.04GB, MaxMemAllocated=27.32GB [cnode7-012:0/16] 2024-12-07 21:48:39,352 (deepspeed_trainer:228) INFO: 28epoch:train:3901-4000batch: iter_time=1.114e-04, loss_ctc=74.374, loss_att=57.123, acc=0.716, loss=62.304, grad_norm=4.742, loss_scale=1.000, learning_rate=7.661e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-07 21:49:12,718 (deepspeed_trainer:228) INFO: 28epoch:train:4001-4100batch: iter_time=1.085e-04, loss_ctc=73.386, loss_att=54.477, acc=0.699, loss=60.120, grad_norm=5.530, loss_scale=1.000, learning_rate=7.660e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 21:49:46,389 (deepspeed_trainer:228) INFO: 28epoch:train:4101-4200batch: iter_time=1.070e-04, loss_ctc=74.566, loss_att=60.515, acc=0.721, loss=64.705, grad_norm=5.210, loss_scale=1.000, learning_rate=7.659e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-07 21:50:20,170 (deepspeed_trainer:228) INFO: 28epoch:train:4201-4300batch: iter_time=1.090e-04, loss_ctc=81.556, loss_att=74.268, acc=0.697, loss=76.461, grad_norm=5.423, loss_scale=1.000, learning_rate=7.658e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-07 21:50:53,512 (deepspeed_trainer:228) INFO: 28epoch:train:4301-4400batch: iter_time=1.076e-04, loss_ctc=70.944, loss_att=52.459, acc=0.718, loss=57.970, grad_norm=5.812, loss_scale=1.000, learning_rate=7.657e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 21:51:27,294 (deepspeed_trainer:228) INFO: 28epoch:train:4401-4500batch: iter_time=1.108e-04, loss_ctc=74.014, loss_att=57.962, acc=0.707, loss=62.788, grad_norm=5.097, loss_scale=1.000, learning_rate=7.656e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-07 21:52:00,601 (deepspeed_trainer:228) INFO: 28epoch:train:4501-4600batch: iter_time=1.097e-04, loss_ctc=79.610, loss_att=56.837, acc=0.714, loss=63.626, grad_norm=6.236, loss_scale=1.000, learning_rate=7.655e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 21:52:34,026 (deepspeed_trainer:228) INFO: 28epoch:train:4601-4700batch: iter_time=1.087e-04, loss_ctc=67.004, loss_att=52.328, acc=0.718, loss=56.709, grad_norm=4.943, loss_scale=1.000, learning_rate=7.654e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 21:53:07,507 (deepspeed_trainer:228) INFO: 28epoch:train:4701-4800batch: iter_time=1.071e-04, loss_ctc=72.934, loss_att=60.050, acc=0.701, loss=63.942, grad_norm=6.650, loss_scale=1.000, learning_rate=7.653e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 21:53:40,841 (deepspeed_trainer:228) INFO: 28epoch:train:4801-4900batch: iter_time=1.088e-04, loss_ctc=67.896, loss_att=50.537, acc=0.722, loss=55.756, grad_norm=4.852, loss_scale=1.000, learning_rate=7.652e-05, step_time=0.333 [2024-12-07 21:54:14,233] [INFO] [logging.py:129:log_dist] [Rank 0] step=410000, skipped=0, lr=[np.float64(7.650911226386205e-05)], mom=[[0.9, 0.98]] [2024-12-07 21:54:14,233] [INFO] [timer.py:264:stop] epoch=0/micro_step=20000/global_step=20000, RunningAvgSamplesPerSec=49.33461628295222, CurrSamplesPerSec=50.02240221981249, MemAllocated=2.04GB, MaxMemAllocated=28.21GB [cnode7-012:0/16] 2024-12-07 21:54:14,236 (deepspeed_trainer:228) INFO: 28epoch:train:4901-5000batch: iter_time=1.085e-04, loss_ctc=73.720, loss_att=50.344, acc=0.722, loss=57.359, grad_norm=4.934, loss_scale=1.000, learning_rate=7.651e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 21:54:48,524 (deepspeed_trainer:228) INFO: 28epoch:train:5001-5100batch: iter_time=1.070e-04, loss_ctc=77.847, loss_att=60.682, acc=0.718, loss=65.830, grad_norm=4.816, loss_scale=1.000, learning_rate=7.650e-05, step_time=0.342 [cnode7-012:0/16] 2024-12-07 21:55:22,257 (deepspeed_trainer:228) INFO: 28epoch:train:5101-5200batch: iter_time=1.095e-04, loss_ctc=70.440, loss_att=51.792, acc=0.718, loss=57.394, grad_norm=5.291, loss_scale=1.000, learning_rate=7.650e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-07 21:55:55,858 (deepspeed_trainer:228) INFO: 28epoch:train:5201-5300batch: iter_time=1.105e-04, loss_ctc=73.477, loss_att=58.845, acc=0.707, loss=63.216, grad_norm=4.972, loss_scale=1.000, learning_rate=7.649e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-07 21:56:29,131 (deepspeed_trainer:228) INFO: 28epoch:train:5301-5400batch: iter_time=1.120e-04, loss_ctc=69.542, loss_att=56.646, acc=0.727, loss=60.502, grad_norm=4.976, loss_scale=1.000, learning_rate=7.648e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 21:57:02,282 (deepspeed_trainer:228) INFO: 28epoch:train:5401-5500batch: iter_time=1.116e-04, loss_ctc=67.906, loss_att=53.115, acc=0.727, loss=57.583, grad_norm=4.672, loss_scale=1.000, learning_rate=7.647e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 21:57:35,556 (deepspeed_trainer:228) INFO: 28epoch:train:5501-5600batch: iter_time=1.077e-04, loss_ctc=74.327, loss_att=55.478, acc=0.714, loss=61.136, grad_norm=5.636, loss_scale=1.000, learning_rate=7.646e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 21:57:48,294 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-07 21:58:14,524 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 21:58:31,739 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 21:58:31,739 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 21:58:31,741 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 21:58:54,718] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:58:57,331] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:58:57,049] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:58:57,305] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:58:58,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:58:58,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:58:59,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:58:59,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:58:59,955] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:00,982] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:00,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:01,415] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:01,451] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:00,961] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:02,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:02,402] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:43,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:44,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:45,165] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:46,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:46,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:49,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:50,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:49,486] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:49,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:50,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:50,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:51,045] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:50,759] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:51,904] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:51,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 21:59:51,983] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:31,466] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:34,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:34,089] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:34,544] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:38,130] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:38,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:38,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:39,845] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:39,934] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:39,077] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:40,544] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:40,394] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:41,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:41,773] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:42,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:00:42,542] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:20,372] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:23,601] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:23,133] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:26,757] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:27,449] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:27,722] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:28,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:29,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:29,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:31,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:31,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:31,821] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:32,282] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:32,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:32,822] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:01:33,947] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 22:02:25,574 (deepspeed_trainer:228) INFO: 28epoch:train:5601-5700batch: iter_time=2.558, loss_ctc=76.730, loss_att=56.046, acc=0.720, loss=62.245, grad_norm=5.772, loss_scale=1.000, learning_rate=7.645e-05, step_time=0.342 [cnode7-012:0/16] 2024-12-07 22:02:59,637 (deepspeed_trainer:228) INFO: 28epoch:train:5701-5800batch: iter_time=1.132e-04, loss_ctc=76.700, loss_att=62.853, acc=0.720, loss=67.025, grad_norm=4.614, loss_scale=1.000, learning_rate=7.644e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-07 22:03:33,113 (deepspeed_trainer:228) INFO: 28epoch:train:5801-5900batch: iter_time=1.110e-04, loss_ctc=71.274, loss_att=49.329, acc=0.717, loss=55.931, grad_norm=5.368, loss_scale=1.000, learning_rate=7.643e-05, step_time=0.334 [2024-12-07 22:04:06,288] [INFO] [logging.py:129:log_dist] [Rank 0] step=411000, skipped=0, lr=[np.float64(7.641597902666543e-05)], mom=[[0.9, 0.98]] [2024-12-07 22:04:06,291] [INFO] [timer.py:264:stop] epoch=0/micro_step=21000/global_step=21000, RunningAvgSamplesPerSec=49.32165573626132, CurrSamplesPerSec=50.098918697107045, MemAllocated=2.04GB, MaxMemAllocated=28.21GB [cnode7-012:0/16] 2024-12-07 22:04:06,294 (deepspeed_trainer:228) INFO: 28epoch:train:5901-6000batch: iter_time=1.122e-04, loss_ctc=72.414, loss_att=52.425, acc=0.710, loss=58.422, grad_norm=5.566, loss_scale=1.000, learning_rate=7.642e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 22:04:39,994 (deepspeed_trainer:228) INFO: 28epoch:train:6001-6100batch: iter_time=1.129e-04, loss_ctc=77.413, loss_att=67.781, acc=0.714, loss=70.647, grad_norm=6.629, loss_scale=1.000, learning_rate=7.641e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-07 22:05:13,428 (deepspeed_trainer:228) INFO: 28epoch:train:6101-6200batch: iter_time=1.106e-04, loss_ctc=81.896, loss_att=68.941, acc=0.696, loss=72.830, grad_norm=6.240, loss_scale=1.000, learning_rate=7.640e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 22:05:46,530 (deepspeed_trainer:228) INFO: 28epoch:train:6201-6300batch: iter_time=1.071e-04, loss_ctc=67.547, loss_att=54.624, acc=0.714, loss=58.506, grad_norm=5.161, loss_scale=1.000, learning_rate=7.639e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 22:06:19,757 (deepspeed_trainer:228) INFO: 28epoch:train:6301-6400batch: iter_time=1.084e-04, loss_ctc=78.846, loss_att=59.833, acc=0.711, loss=65.536, grad_norm=5.121, loss_scale=1.000, learning_rate=7.638e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 22:06:52,682 (deepspeed_trainer:228) INFO: 28epoch:train:6401-6500batch: iter_time=1.100e-04, loss_ctc=74.817, loss_att=52.454, acc=0.718, loss=59.184, grad_norm=6.390, loss_scale=1.000, learning_rate=7.637e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 22:07:25,811 (deepspeed_trainer:228) INFO: 28epoch:train:6501-6600batch: iter_time=1.104e-04, loss_ctc=61.587, loss_att=48.708, acc=0.716, loss=52.564, grad_norm=4.906, loss_scale=1.000, learning_rate=7.636e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 22:07:59,073 (deepspeed_trainer:228) INFO: 28epoch:train:6601-6700batch: iter_time=1.100e-04, loss_ctc=75.353, loss_att=63.103, acc=0.702, loss=66.766, grad_norm=6.021, loss_scale=1.000, learning_rate=7.636e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 22:08:32,254 (deepspeed_trainer:228) INFO: 28epoch:train:6701-6800batch: iter_time=1.110e-04, loss_ctc=71.846, loss_att=49.991, acc=0.722, loss=56.539, grad_norm=5.145, loss_scale=1.000, learning_rate=7.635e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 22:09:05,406 (deepspeed_trainer:228) INFO: 28epoch:train:6801-6900batch: iter_time=1.076e-04, loss_ctc=75.489, loss_att=56.245, acc=0.721, loss=62.024, grad_norm=5.212, loss_scale=1.000, learning_rate=7.634e-05, step_time=0.331 [2024-12-07 22:09:38,488] [INFO] [logging.py:129:log_dist] [Rank 0] step=412000, skipped=0, lr=[np.float64(7.632318507204926e-05)], mom=[[0.9, 0.98]] [2024-12-07 22:09:38,489] [INFO] [timer.py:264:stop] epoch=0/micro_step=22000/global_step=22000, RunningAvgSamplesPerSec=49.336358287570405, CurrSamplesPerSec=47.575261261433354, MemAllocated=2.04GB, MaxMemAllocated=28.21GB [cnode7-012:0/16] 2024-12-07 22:09:38,491 (deepspeed_trainer:228) INFO: 28epoch:train:6901-7000batch: iter_time=1.077e-04, loss_ctc=71.839, loss_att=53.233, acc=0.720, loss=58.829, grad_norm=5.206, loss_scale=1.000, learning_rate=7.633e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 22:10:12,275 (deepspeed_trainer:228) INFO: 28epoch:train:7001-7100batch: iter_time=1.088e-04, loss_ctc=71.140, loss_att=53.431, acc=0.718, loss=58.724, grad_norm=5.780, loss_scale=1.000, learning_rate=7.632e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-07 22:10:45,555 (deepspeed_trainer:228) INFO: 28epoch:train:7101-7200batch: iter_time=1.107e-04, loss_ctc=74.259, loss_att=59.827, acc=0.711, loss=64.189, grad_norm=5.025, loss_scale=1.000, learning_rate=7.631e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 22:11:18,665 (deepspeed_trainer:228) INFO: 28epoch:train:7201-7300batch: iter_time=1.108e-04, loss_ctc=74.351, loss_att=57.640, acc=0.733, loss=62.672, grad_norm=4.933, loss_scale=1.000, learning_rate=7.630e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 22:11:51,499 (deepspeed_trainer:228) INFO: 28epoch:train:7301-7400batch: iter_time=1.096e-04, loss_ctc=66.000, loss_att=49.348, acc=0.729, loss=54.341, grad_norm=5.161, loss_scale=1.000, learning_rate=7.629e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 22:12:24,317 (deepspeed_trainer:228) INFO: 28epoch:train:7401-7500batch: iter_time=1.056e-04, loss_ctc=74.718, loss_att=56.931, acc=0.714, loss=62.290, grad_norm=5.203, loss_scale=1.000, learning_rate=7.628e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 22:12:28,056 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-07 22:12:55,081 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 22:13:12,124 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 22:13:12,124 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 22:13:12,126 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 22:13:36,150] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:13:36,201] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:13:36,204] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:13:36,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:13:36,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:13:39,045] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:13:39,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:13:39,734] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:13:39,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:13:39,961] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:13:40,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:13:41,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:13:41,150] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:13:41,696] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:13:41,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:13:44,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:24,668] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:24,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:24,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:25,185] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:25,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:28,680] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:28,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:29,262] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:29,883] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:29,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:30,201] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:30,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:30,963] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:31,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:31,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:14:38,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:11,668] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:12,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:13,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:14,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:17,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:17,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:18,362] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:19,291] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:19,327] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:19,528] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:19,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:19,997] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:20,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:20,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:20,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:30,786] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:59,838] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:15:59,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:16:02,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:16:05,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:16:07,645] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:16:07,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:16:07,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:16:08,707] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:16:08,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:16:09,055] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:16:09,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:16:09,611] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:16:10,359] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:16:10,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:16:11,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:16:22,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 22:17:21,715 (deepspeed_trainer:228) INFO: 28epoch:train:7501-7600batch: iter_time=2.531, loss_ctc=71.035, loss_att=53.873, acc=0.719, loss=59.022, grad_norm=5.056, loss_scale=1.000, learning_rate=7.627e-05, step_time=0.453 [cnode7-012:0/16] 2024-12-07 22:17:56,014 (deepspeed_trainer:228) INFO: 28epoch:train:7601-7700batch: iter_time=1.087e-04, loss_ctc=77.763, loss_att=61.803, acc=0.721, loss=66.601, grad_norm=5.026, loss_scale=1.000, learning_rate=7.626e-05, step_time=0.342 [cnode7-012:0/16] 2024-12-07 22:18:29,723 (deepspeed_trainer:228) INFO: 28epoch:train:7701-7800batch: iter_time=1.123e-04, loss_ctc=78.583, loss_att=55.870, acc=0.709, loss=62.664, grad_norm=5.452, loss_scale=1.000, learning_rate=7.625e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-07 22:19:02,634 (deepspeed_trainer:228) INFO: 28epoch:train:7801-7900batch: iter_time=1.088e-04, loss_ctc=68.650, loss_att=53.928, acc=0.715, loss=58.361, grad_norm=4.941, loss_scale=1.000, learning_rate=7.624e-05, step_time=0.333 [2024-12-07 22:19:37,124] [INFO] [logging.py:129:log_dist] [Rank 0] step=413000, skipped=0, lr=[np.float64(7.623072834500416e-05)], mom=[[0.9, 0.98]] [2024-12-07 22:19:37,125] [INFO] [timer.py:264:stop] epoch=0/micro_step=23000/global_step=23000, RunningAvgSamplesPerSec=49.25615578594566, CurrSamplesPerSec=50.97833903544754, MemAllocated=2.04GB, MaxMemAllocated=28.21GB [cnode7-012:0/16] 2024-12-07 22:19:37,128 (deepspeed_trainer:228) INFO: 28epoch:train:7901-8000batch: iter_time=1.085e-04, loss_ctc=78.249, loss_att=68.317, acc=0.706, loss=71.283, grad_norm=5.450, loss_scale=1.000, learning_rate=7.624e-05, step_time=0.345 [cnode7-012:0/16] 2024-12-07 22:20:11,071 (deepspeed_trainer:228) INFO: 28epoch:train:8001-8100batch: iter_time=1.118e-04, loss_ctc=80.403, loss_att=62.168, acc=0.714, loss=67.644, grad_norm=5.628, loss_scale=1.000, learning_rate=7.623e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-07 22:20:44,558 (deepspeed_trainer:228) INFO: 28epoch:train:8101-8200batch: iter_time=1.083e-04, loss_ctc=64.723, loss_att=51.015, acc=0.717, loss=55.099, grad_norm=4.947, loss_scale=1.000, learning_rate=7.622e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-07 22:21:18,348 (deepspeed_trainer:228) INFO: 28epoch:train:8201-8300batch: iter_time=1.080e-04, loss_ctc=84.910, loss_att=62.925, acc=0.710, loss=69.499, grad_norm=5.754, loss_scale=1.000, learning_rate=7.621e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-07 22:21:51,633 (deepspeed_trainer:228) INFO: 28epoch:train:8301-8400batch: iter_time=1.112e-04, loss_ctc=68.871, loss_att=52.045, acc=0.719, loss=57.069, grad_norm=5.218, loss_scale=1.000, learning_rate=7.620e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 22:22:25,241 (deepspeed_trainer:228) INFO: 28epoch:train:8401-8500batch: iter_time=1.098e-04, loss_ctc=71.350, loss_att=57.568, acc=0.709, loss=61.739, grad_norm=5.841, loss_scale=1.000, learning_rate=7.619e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-07 22:22:58,961 (deepspeed_trainer:228) INFO: 28epoch:train:8501-8600batch: iter_time=1.090e-04, loss_ctc=65.315, loss_att=51.832, acc=0.717, loss=55.865, grad_norm=4.931, loss_scale=1.000, learning_rate=7.618e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-07 22:23:32,821 (deepspeed_trainer:228) INFO: 28epoch:train:8601-8700batch: iter_time=1.103e-04, loss_ctc=75.766, loss_att=52.702, acc=0.722, loss=59.600, grad_norm=5.206, loss_scale=1.000, learning_rate=7.617e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-07 22:24:06,918 (deepspeed_trainer:228) INFO: 28epoch:train:8701-8800batch: iter_time=1.089e-04, loss_ctc=73.120, loss_att=55.708, acc=0.721, loss=60.904, grad_norm=5.015, loss_scale=1.000, learning_rate=7.616e-05, step_time=0.341 [cnode7-012:0/16] 2024-12-07 22:24:40,466 (deepspeed_trainer:228) INFO: 28epoch:train:8801-8900batch: iter_time=1.113e-04, loss_ctc=67.438, loss_att=49.807, acc=0.722, loss=55.065, grad_norm=5.117, loss_scale=1.000, learning_rate=7.615e-05, step_time=0.335 [2024-12-07 22:25:14,090] [INFO] [logging.py:129:log_dist] [Rank 0] step=414000, skipped=0, lr=[np.float64(7.613860680790448e-05)], mom=[[0.9, 0.98]] [2024-12-07 22:25:14,091] [INFO] [timer.py:264:stop] epoch=0/micro_step=24000/global_step=24000, RunningAvgSamplesPerSec=49.242065063622135, CurrSamplesPerSec=49.10330928664032, MemAllocated=2.04GB, MaxMemAllocated=28.21GB [cnode7-012:0/16] 2024-12-07 22:25:14,093 (deepspeed_trainer:228) INFO: 28epoch:train:8901-9000batch: iter_time=1.097e-04, loss_ctc=75.263, loss_att=58.173, acc=0.710, loss=63.262, grad_norm=5.828, loss_scale=1.000, learning_rate=7.614e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-07 22:25:47,909 (deepspeed_trainer:228) INFO: 28epoch:train:9001-9100batch: iter_time=1.099e-04, loss_ctc=69.491, loss_att=57.306, acc=0.721, loss=60.976, grad_norm=4.874, loss_scale=1.000, learning_rate=7.613e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-07 22:26:21,573 (deepspeed_trainer:228) INFO: 28epoch:train:9101-9200batch: iter_time=1.081e-04, loss_ctc=74.944, loss_att=57.682, acc=0.729, loss=62.871, grad_norm=5.097, loss_scale=1.000, learning_rate=7.612e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-07 22:26:54,699 (deepspeed_trainer:228) INFO: 28epoch:train:9201-9300batch: iter_time=1.097e-04, loss_ctc=68.341, loss_att=51.759, acc=0.727, loss=56.750, grad_norm=4.853, loss_scale=1.000, learning_rate=7.612e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 22:27:24,587 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-07 22:27:52,086 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 22:28:09,256 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 22:28:09,256 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 22:28:09,258 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 22:28:31,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:28:33,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:28:33,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:28:33,536] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:28:33,953] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:28:33,815] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:28:35,466] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:28:37,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:28:37,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:28:38,033] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:28:37,756] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:28:38,513] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:28:38,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:28:39,324] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:28:39,063] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:28:39,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:20,566] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:21,629] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:22,095] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:22,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:22,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:23,159] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:26,336] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:26,078] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:27,801] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:27,935] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:28,318] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:28,557] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:28,294] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:28,702] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:29,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:29:29,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:07,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:09,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:10,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:11,251] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:11,498] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:14,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:14,146] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:15,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:16,486] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:16,668] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:17,266] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:17,635] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:17,703] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:17,932] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:18,313] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:18,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:56,617] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:58,458] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:58,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:30:59,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:31:00,201] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:31:03,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:31:04,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:31:04,563] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:31:04,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:31:05,138] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:31:06,195] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:31:05,989] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:31:06,383] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:31:06,658] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:31:06,736] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:31:07,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 22:31:43,213 (deepspeed_trainer:228) INFO: 28epoch:train:9301-9400batch: iter_time=2.529, loss_ctc=71.347, loss_att=51.634, acc=0.718, loss=57.547, grad_norm=4.865, loss_scale=1.000, learning_rate=7.611e-05, step_time=0.356 [cnode7-012:0/16] 2024-12-07 22:32:16,251 (deepspeed_trainer:228) INFO: 28epoch:train:9401-9500batch: iter_time=1.134e-04, loss_ctc=76.767, loss_att=59.402, acc=0.712, loss=64.644, grad_norm=5.587, loss_scale=1.000, learning_rate=7.610e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 22:32:49,679 (deepspeed_trainer:228) INFO: 28epoch:train:9501-9600batch: iter_time=1.057e-04, loss_ctc=70.691, loss_att=58.282, acc=0.708, loss=62.049, grad_norm=5.200, loss_scale=1.000, learning_rate=7.609e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 22:33:23,793 (deepspeed_trainer:228) INFO: 28epoch:train:9601-9700batch: iter_time=1.098e-04, loss_ctc=76.401, loss_att=56.526, acc=0.701, loss=62.507, grad_norm=5.347, loss_scale=1.000, learning_rate=7.608e-05, step_time=0.341 [cnode7-012:0/16] 2024-12-07 22:33:57,189 (deepspeed_trainer:228) INFO: 28epoch:train:9701-9800batch: iter_time=1.109e-04, loss_ctc=72.491, loss_att=60.865, acc=0.710, loss=64.343, grad_norm=5.161, loss_scale=1.000, learning_rate=7.607e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 22:34:30,603 (deepspeed_trainer:228) INFO: 28epoch:train:9801-9900batch: iter_time=1.089e-04, loss_ctc=81.874, loss_att=68.272, acc=0.705, loss=72.349, grad_norm=5.614, loss_scale=1.000, learning_rate=7.606e-05, step_time=0.334 [2024-12-07 22:35:04,892] [INFO] [logging.py:129:log_dist] [Rank 0] step=415000, skipped=0, lr=[np.float64(7.604681844031973e-05)], mom=[[0.9, 0.98]] [2024-12-07 22:35:04,893] [INFO] [timer.py:264:stop] epoch=0/micro_step=25000/global_step=25000, RunningAvgSamplesPerSec=49.225110445891474, CurrSamplesPerSec=51.143882110644554, MemAllocated=2.04GB, MaxMemAllocated=28.21GB [cnode7-012:0/16] 2024-12-07 22:35:04,895 (deepspeed_trainer:228) INFO: 28epoch:train:9901-10000batch: iter_time=1.088e-04, loss_ctc=75.681, loss_att=56.543, acc=0.715, loss=62.289, grad_norm=5.409, loss_scale=1.000, learning_rate=7.605e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 22:35:38,691 (deepspeed_trainer:228) INFO: 28epoch:train:10001-10100batch: iter_time=1.100e-04, loss_ctc=69.011, loss_att=55.127, acc=0.704, loss=59.291, grad_norm=4.931, loss_scale=1.000, learning_rate=7.604e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-07 22:36:11,681 (deepspeed_trainer:228) INFO: 28epoch:train:10101-10200batch: iter_time=1.099e-04, loss_ctc=81.109, loss_att=57.139, acc=0.710, loss=64.339, grad_norm=5.725, loss_scale=1.000, learning_rate=7.603e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 22:36:44,556 (deepspeed_trainer:228) INFO: 28epoch:train:10201-10300batch: iter_time=1.090e-04, loss_ctc=67.911, loss_att=50.759, acc=0.715, loss=55.944, grad_norm=5.206, loss_scale=1.000, learning_rate=7.602e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 22:37:17,265 (deepspeed_trainer:228) INFO: 28epoch:train:10301-10400batch: iter_time=1.112e-04, loss_ctc=72.017, loss_att=57.553, acc=0.706, loss=61.899, grad_norm=5.855, loss_scale=1.000, learning_rate=7.601e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-07 22:37:50,122 (deepspeed_trainer:228) INFO: 28epoch:train:10401-10500batch: iter_time=1.090e-04, loss_ctc=63.266, loss_att=50.193, acc=0.715, loss=54.137, grad_norm=5.735, loss_scale=1.000, learning_rate=7.601e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 22:38:23,449 (deepspeed_trainer:228) INFO: 28epoch:train:10501-10600batch: iter_time=1.126e-04, loss_ctc=79.321, loss_att=54.238, acc=0.717, loss=61.759, grad_norm=6.062, loss_scale=1.000, learning_rate=7.600e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 22:38:56,582 (deepspeed_trainer:228) INFO: 28epoch:train:10601-10700batch: iter_time=1.099e-04, loss_ctc=70.754, loss_att=51.468, acc=0.725, loss=57.249, grad_norm=4.816, loss_scale=1.000, learning_rate=7.599e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 22:39:29,419 (deepspeed_trainer:228) INFO: 28epoch:train:10701-10800batch: iter_time=1.102e-04, loss_ctc=70.513, loss_att=52.981, acc=0.709, loss=58.237, grad_norm=5.036, loss_scale=1.000, learning_rate=7.598e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 22:40:02,199 (deepspeed_trainer:228) INFO: 28epoch:train:10801-10900batch: iter_time=1.119e-04, loss_ctc=71.380, loss_att=53.662, acc=0.708, loss=58.997, grad_norm=5.453, loss_scale=1.000, learning_rate=7.597e-05, step_time=0.327 [2024-12-07 22:40:35,182] [INFO] [logging.py:129:log_dist] [Rank 0] step=416000, skipped=0, lr=[np.float64(7.595536123882837e-05)], mom=[[0.9, 0.98]] [2024-12-07 22:40:35,183] [INFO] [timer.py:264:stop] epoch=0/micro_step=26000/global_step=26000, RunningAvgSamplesPerSec=49.25455502186318, CurrSamplesPerSec=49.152437008941405, MemAllocated=2.04GB, MaxMemAllocated=28.21GB [cnode7-012:0/16] 2024-12-07 22:40:35,186 (deepspeed_trainer:228) INFO: 28epoch:train:10901-11000batch: iter_time=1.097e-04, loss_ctc=72.440, loss_att=60.344, acc=0.715, loss=63.976, grad_norm=4.818, loss_scale=1.000, learning_rate=7.596e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 22:41:07,921 (deepspeed_trainer:228) INFO: 28epoch:train:11001-11100batch: iter_time=1.093e-04, loss_ctc=69.517, loss_att=52.978, acc=0.727, loss=57.922, grad_norm=5.022, loss_scale=1.000, learning_rate=7.595e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-07 22:41:40,196 (deepspeed_trainer:228) INFO: 28epoch:train:11101-11200batch: iter_time=1.113e-04, loss_ctc=70.713, loss_att=54.080, acc=0.715, loss=59.083, grad_norm=5.229, loss_scale=1.000, learning_rate=7.594e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 22:42:01,258 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-07 22:42:29,050 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 22:42:45,590 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 22:42:45,590 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 22:42:45,592 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 22:43:10,959] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:11,230] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:11,116] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:11,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:11,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:12,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:12,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:12,205] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:13,271] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:12,883] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:12,982] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:13,033] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:13,392] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:14,381] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:14,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:15,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:43:59,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:00,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:00,324] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:00,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:00,158] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:00,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:01,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:01,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:01,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:01,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:02,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:02,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:03,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:03,497] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:04,042] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:05,200] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:47,762] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:49,186] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:49,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:49,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:49,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:50,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:50,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:51,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:52,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:52,569] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:52,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:52,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:52,868] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:53,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:53,785] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:44:54,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:34,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:37,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:37,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:37,467] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:37,865] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:38,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:40,966] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:40,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:40,696] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:40,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:41,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:41,174] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:42,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:43,376] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:43,475] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:45:44,810] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 22:46:29,807 (deepspeed_trainer:228) INFO: 28epoch:train:11201-11300batch: iter_time=2.522, loss_ctc=75.340, loss_att=54.664, acc=0.710, loss=60.873, grad_norm=5.184, loss_scale=1.000, learning_rate=7.593e-05, step_time=0.374 [cnode7-012:0/16] 2024-12-07 22:47:03,854 (deepspeed_trainer:228) INFO: 28epoch:train:11301-11400batch: iter_time=1.062e-04, loss_ctc=73.980, loss_att=57.964, acc=0.720, loss=62.749, grad_norm=4.806, loss_scale=1.000, learning_rate=7.592e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-07 22:47:37,676 (deepspeed_trainer:228) INFO: 28epoch:train:11401-11500batch: iter_time=1.126e-04, loss_ctc=74.030, loss_att=56.888, acc=0.710, loss=62.029, grad_norm=5.183, loss_scale=1.000, learning_rate=7.591e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-07 22:48:10,792 (deepspeed_trainer:228) INFO: 28epoch:train:11501-11600batch: iter_time=1.144e-04, loss_ctc=72.728, loss_att=53.495, acc=0.701, loss=59.258, grad_norm=5.086, loss_scale=1.000, learning_rate=7.591e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 22:48:44,078 (deepspeed_trainer:228) INFO: 28epoch:train:11601-11700batch: iter_time=1.129e-04, loss_ctc=73.803, loss_att=59.503, acc=0.720, loss=63.822, grad_norm=5.210, loss_scale=1.000, learning_rate=7.590e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 22:49:17,657 (deepspeed_trainer:228) INFO: 28epoch:train:11701-11800batch: iter_time=1.115e-04, loss_ctc=80.548, loss_att=71.930, acc=0.697, loss=74.530, grad_norm=5.633, loss_scale=1.000, learning_rate=7.589e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-07 22:49:51,033 (deepspeed_trainer:228) INFO: 28epoch:train:11801-11900batch: iter_time=1.091e-04, loss_ctc=70.194, loss_att=50.235, acc=0.720, loss=56.210, grad_norm=5.582, loss_scale=1.000, learning_rate=7.588e-05, step_time=0.333 [2024-12-07 22:50:23,963] [INFO] [logging.py:129:log_dist] [Rank 0] step=417000, skipped=0, lr=[np.float64(7.586423321683429e-05)], mom=[[0.9, 0.98]] [2024-12-07 22:50:23,963] [INFO] [timer.py:264:stop] epoch=0/micro_step=27000/global_step=27000, RunningAvgSamplesPerSec=49.246800247947625, CurrSamplesPerSec=51.61715276321525, MemAllocated=2.04GB, MaxMemAllocated=28.21GB [cnode7-012:0/16] 2024-12-07 22:50:23,966 (deepspeed_trainer:228) INFO: 28epoch:train:11901-12000batch: iter_time=1.119e-04, loss_ctc=72.882, loss_att=56.853, acc=0.707, loss=61.658, grad_norm=5.569, loss_scale=1.000, learning_rate=7.587e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 22:50:56,953 (deepspeed_trainer:228) INFO: 28epoch:train:12001-12100batch: iter_time=1.133e-04, loss_ctc=78.606, loss_att=54.285, acc=0.719, loss=61.545, grad_norm=5.436, loss_scale=1.000, learning_rate=7.586e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 22:51:29,605 (deepspeed_trainer:228) INFO: 28epoch:train:12101-12200batch: iter_time=1.128e-04, loss_ctc=66.310, loss_att=51.195, acc=0.716, loss=55.754, grad_norm=5.186, loss_scale=1.000, learning_rate=7.585e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-07 22:52:02,160 (deepspeed_trainer:228) INFO: 28epoch:train:12201-12300batch: iter_time=1.132e-04, loss_ctc=73.111, loss_att=59.528, acc=0.702, loss=63.622, grad_norm=5.717, loss_scale=1.000, learning_rate=7.584e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-07 22:52:34,612 (deepspeed_trainer:228) INFO: 28epoch:train:12301-12400batch: iter_time=1.124e-04, loss_ctc=67.509, loss_att=49.717, acc=0.720, loss=55.039, grad_norm=4.607, loss_scale=1.000, learning_rate=7.583e-05, step_time=0.324 [cnode7-012:0/16] 2024-12-07 22:53:07,070 (deepspeed_trainer:228) INFO: 28epoch:train:12401-12500batch: iter_time=1.150e-04, loss_ctc=73.354, loss_att=50.179, acc=0.717, loss=57.114, grad_norm=5.088, loss_scale=1.000, learning_rate=7.582e-05, step_time=0.324 [cnode7-012:0/16] 2024-12-07 22:53:39,727 (deepspeed_trainer:228) INFO: 28epoch:train:12501-12600batch: iter_time=1.114e-04, loss_ctc=77.306, loss_att=57.649, acc=0.720, loss=63.531, grad_norm=4.767, loss_scale=1.000, learning_rate=7.581e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-07 22:54:12,389 (deepspeed_trainer:228) INFO: 28epoch:train:12601-12700batch: iter_time=1.124e-04, loss_ctc=69.836, loss_att=51.090, acc=0.716, loss=56.697, grad_norm=5.127, loss_scale=1.000, learning_rate=7.581e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-07 22:54:45,177 (deepspeed_trainer:228) INFO: 28epoch:train:12701-12800batch: iter_time=1.115e-04, loss_ctc=72.679, loss_att=56.564, acc=0.704, loss=61.394, grad_norm=5.032, loss_scale=1.000, learning_rate=7.580e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 22:55:18,201 (deepspeed_trainer:228) INFO: 28epoch:train:12801-12900batch: iter_time=1.118e-04, loss_ctc=68.364, loss_att=55.047, acc=0.725, loss=59.038, grad_norm=4.699, loss_scale=1.000, learning_rate=7.579e-05, step_time=0.330 [2024-12-07 22:55:51,623] [INFO] [logging.py:129:log_dist] [Rank 0] step=418000, skipped=0, lr=[np.float64(7.577343240438542e-05)], mom=[[0.9, 0.98]] [2024-12-07 22:55:51,624] [INFO] [timer.py:264:stop] epoch=0/micro_step=28000/global_step=28000, RunningAvgSamplesPerSec=49.287482934224144, CurrSamplesPerSec=50.413550068808476, MemAllocated=2.04GB, MaxMemAllocated=28.21GB [cnode7-012:0/16] 2024-12-07 22:55:51,627 (deepspeed_trainer:228) INFO: 28epoch:train:12901-13000batch: iter_time=1.289e-04, loss_ctc=67.046, loss_att=51.712, acc=0.723, loss=56.311, grad_norm=4.720, loss_scale=1.000, learning_rate=7.578e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 22:56:24,249 (deepspeed_trainer:228) INFO: 28epoch:train:13001-13100batch: iter_time=1.108e-04, loss_ctc=73.019, loss_att=55.156, acc=0.710, loss=60.520, grad_norm=5.118, loss_scale=1.000, learning_rate=7.577e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-07 22:56:37,261 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-07 22:57:03,855 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 22:57:20,043 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 22:57:20,043 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 22:57:20,045 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 22:57:45,038] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:57:45,081] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:57:45,824] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:57:45,845] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:57:45,374] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:57:46,895] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:57:47,761] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:57:48,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:57:48,955] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:57:49,590] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:57:49,166] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:57:49,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:57:49,300] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:57:50,146] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:57:50,559] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:57:50,836] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:33,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:34,331] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:33,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:34,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:34,957] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:35,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:38,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:37,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:38,179] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:38,878] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:39,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:38,785] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:39,600] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:39,720] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:39,150] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:58:39,668] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:21,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:22,412] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:21,958] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:22,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:23,957] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:24,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:25,855] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:26,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:27,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:27,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:27,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:27,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:28,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:27,976] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:28,716] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 22:59:32,457] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:10,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:10,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:10,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:11,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:13,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:13,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:16,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:15,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:16,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:16,321] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:16,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:16,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:17,491] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:17,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:18,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:00:24,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 23:01:15,234 (deepspeed_trainer:228) INFO: 28epoch:train:13101-13200batch: iter_time=2.512, loss_ctc=76.103, loss_att=55.544, acc=0.717, loss=61.702, grad_norm=5.605, loss_scale=1.000, learning_rate=7.576e-05, step_time=0.398 [cnode7-012:0/16] 2024-12-07 23:01:48,698 (deepspeed_trainer:228) INFO: 28epoch:train:13201-13300batch: iter_time=1.083e-04, loss_ctc=75.981, loss_att=62.458, acc=0.717, loss=66.532, grad_norm=4.759, loss_scale=1.000, learning_rate=7.575e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 23:02:21,421 (deepspeed_trainer:228) INFO: 28epoch:train:13301-13400batch: iter_time=1.056e-04, loss_ctc=70.665, loss_att=49.685, acc=0.711, loss=55.968, grad_norm=5.380, loss_scale=1.000, learning_rate=7.574e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-07 23:02:53,976 (deepspeed_trainer:228) INFO: 28epoch:train:13401-13500batch: iter_time=1.104e-04, loss_ctc=71.217, loss_att=51.903, acc=0.713, loss=57.717, grad_norm=5.606, loss_scale=1.000, learning_rate=7.573e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-07 23:03:26,905 (deepspeed_trainer:228) INFO: 28epoch:train:13501-13600batch: iter_time=1.085e-04, loss_ctc=76.229, loss_att=65.035, acc=0.716, loss=68.395, grad_norm=5.334, loss_scale=1.000, learning_rate=7.572e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 23:03:59,819 (deepspeed_trainer:228) INFO: 28epoch:train:13601-13700batch: iter_time=1.097e-04, loss_ctc=81.273, loss_att=67.633, acc=0.696, loss=71.700, grad_norm=6.051, loss_scale=1.000, learning_rate=7.571e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 23:04:32,356 (deepspeed_trainer:228) INFO: 28epoch:train:13701-13800batch: iter_time=1.099e-04, loss_ctc=66.571, loss_att=52.173, acc=0.713, loss=56.499, grad_norm=5.003, loss_scale=1.000, learning_rate=7.571e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-07 23:05:05,133 (deepspeed_trainer:228) INFO: 28epoch:train:13801-13900batch: iter_time=1.080e-04, loss_ctc=78.471, loss_att=58.714, acc=0.710, loss=64.640, grad_norm=5.170, loss_scale=1.000, learning_rate=7.570e-05, step_time=0.327 [2024-12-07 23:05:37,615] [INFO] [logging.py:129:log_dist] [Rank 0] step=419000, skipped=0, lr=[np.float64(7.568295684799499e-05)], mom=[[0.9, 0.98]] [2024-12-07 23:05:37,616] [INFO] [timer.py:264:stop] epoch=0/micro_step=29000/global_step=29000, RunningAvgSamplesPerSec=49.28627543488483, CurrSamplesPerSec=50.992748809050845, MemAllocated=2.04GB, MaxMemAllocated=28.21GB [cnode7-012:0/16] 2024-12-07 23:05:37,618 (deepspeed_trainer:228) INFO: 28epoch:train:13901-14000batch: iter_time=1.135e-04, loss_ctc=73.118, loss_att=51.025, acc=0.717, loss=57.626, grad_norm=5.526, loss_scale=1.000, learning_rate=7.569e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-07 23:06:10,152 (deepspeed_trainer:228) INFO: 28epoch:train:14001-14100batch: iter_time=1.084e-04, loss_ctc=61.148, loss_att=46.908, acc=0.716, loss=51.172, grad_norm=4.995, loss_scale=1.000, learning_rate=7.568e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-07 23:06:42,838 (deepspeed_trainer:228) INFO: 28epoch:train:14101-14200batch: iter_time=1.082e-04, loss_ctc=75.308, loss_att=61.893, acc=0.701, loss=65.893, grad_norm=6.517, loss_scale=1.000, learning_rate=7.567e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-07 23:07:15,429 (deepspeed_trainer:228) INFO: 28epoch:train:14201-14300batch: iter_time=1.074e-04, loss_ctc=71.016, loss_att=49.321, acc=0.722, loss=55.870, grad_norm=5.233, loss_scale=1.000, learning_rate=7.566e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-07 23:07:48,246 (deepspeed_trainer:228) INFO: 28epoch:train:14301-14400batch: iter_time=1.080e-04, loss_ctc=75.657, loss_att=53.769, acc=0.722, loss=60.316, grad_norm=5.244, loss_scale=1.000, learning_rate=7.565e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 23:08:21,159 (deepspeed_trainer:228) INFO: 28epoch:train:14401-14500batch: iter_time=1.148e-04, loss_ctc=70.490, loss_att=51.462, acc=0.719, loss=57.173, grad_norm=4.822, loss_scale=1.000, learning_rate=7.564e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 23:08:54,038 (deepspeed_trainer:228) INFO: 28epoch:train:14501-14600batch: iter_time=1.068e-04, loss_ctc=69.785, loss_att=51.186, acc=0.716, loss=56.778, grad_norm=5.140, loss_scale=1.000, learning_rate=7.563e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 23:09:27,089 (deepspeed_trainer:228) INFO: 28epoch:train:14601-14700batch: iter_time=1.086e-04, loss_ctc=72.873, loss_att=59.309, acc=0.703, loss=63.385, grad_norm=5.461, loss_scale=1.000, learning_rate=7.562e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 23:10:00,010 (deepspeed_trainer:228) INFO: 28epoch:train:14701-14800batch: iter_time=1.090e-04, loss_ctc=72.883, loss_att=56.504, acc=0.732, loss=61.448, grad_norm=5.319, loss_scale=1.000, learning_rate=7.562e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 23:10:32,886 (deepspeed_trainer:228) INFO: 28epoch:train:14801-14900batch: iter_time=1.072e-04, loss_ctc=64.710, loss_att=48.577, acc=0.723, loss=53.422, grad_norm=5.026, loss_scale=1.000, learning_rate=7.561e-05, step_time=0.328 [2024-12-07 23:11:06,389] [INFO] [logging.py:129:log_dist] [Rank 0] step=420000, skipped=0, lr=[np.float64(7.559280461046496e-05)], mom=[[0.9, 0.98]] [2024-12-07 23:11:06,390] [INFO] [timer.py:264:stop] epoch=0/micro_step=30000/global_step=30000, RunningAvgSamplesPerSec=49.31499167911078, CurrSamplesPerSec=50.579518202981234, MemAllocated=2.04GB, MaxMemAllocated=28.21GB [cnode7-012:0/16] 2024-12-07 23:11:06,392 (deepspeed_trainer:228) INFO: 28epoch:train:14901-15000batch: iter_time=1.068e-04, loss_ctc=73.791, loss_att=56.541, acc=0.710, loss=61.701, grad_norm=5.429, loss_scale=1.000, learning_rate=7.560e-05, step_time=0.335 [2024-12-07 23:11:21,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:23,376] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:23,227] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:23,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:23,367] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:23,418] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:23,452] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:23,534] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:23,543] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:24,058] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:24,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:24,150] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:24,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:24,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:24,332] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:24,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:36,623] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:38,770] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:39,663] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:40,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:40,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:40,229] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:40,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:40,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:40,469] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:41,155] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:41,204] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:41,303] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:41,330] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:40,951] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:41,161] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:41,201] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:52,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:55,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:55,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:56,214] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:55,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:56,451] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:56,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:56,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:56,727] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:56,947] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:57,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:57,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:57,600] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:57,761] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:57,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:11:57,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:06,999] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:10,925] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:11,029] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:11,608] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:11,410] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:12,120] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:11,815] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:12,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:12,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:13,580] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:13,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:13,766] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:13,312] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:13,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:13,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:14,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:12:27,720] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 28 is about to be saved! [2024-12-07 23:12:27,751] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/mp_rank_00_model_states.pt [2024-12-07 23:12:27,751] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/mp_rank_00_model_states.pt... [2024-12-07 23:12:29,576] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/mp_rank_00_model_states.pt. [2024-12-07 23:12:29,732] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-07 23:12:29,732] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-07 23:12:29,211] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-07 23:12:29,212] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-07 23:12:29,213] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-07 23:12:29,214] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-07 23:12:29,215] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-07 23:12:29,739] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-07 23:12:29,739] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-07 23:12:29,739] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-07 23:12:29,740] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-07 23:12:29,219] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-07 23:12:29,220] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-07 23:12:29,220] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-07 23:12:29,741] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-07 23:12:29,741] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-07 23:12:30,395] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-07 23:12:30,396] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-07 23:12:30,396] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [2024-12-07 23:12:29,892] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-07 23:12:29,892] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-07 23:12:29,892] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [2024-12-07 23:12:30,496] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-07 23:12:30,497] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-07 23:12:30,497] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [2024-12-07 23:12:30,497] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-07 23:12:30,497] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-07 23:12:30,497] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [2024-12-07 23:12:29,975] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-07 23:12:29,975] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-07 23:12:29,975] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [2024-12-07 23:12:29,996] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-07 23:12:29,996] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-07 23:12:29,996] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [2024-12-07 23:12:30,012] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-07 23:12:30,012] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-07 23:12:30,012] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [2024-12-07 23:12:30,063] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-07 23:12:30,063] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-07 23:12:30,063] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [2024-12-07 23:12:30,064] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-07 23:12:30,064] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-07 23:12:30,064] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [2024-12-07 23:12:30,069] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-07 23:12:30,069] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-07 23:12:30,069] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [2024-12-07 23:12:30,075] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-07 23:12:30,075] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-07 23:12:30,075] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [2024-12-07 23:12:32,173] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-07 23:12:32,173] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-07 23:12:32,173] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [2024-12-07 23:12:32,756] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-07 23:12:32,761] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-07 23:12:32,761] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [2024-12-07 23:12:33,004] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-07 23:12:33,004] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-07 23:12:33,004] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [2024-12-07 23:12:33,008] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-07 23:12:33,009] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-07 23:12:33,009] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [2024-12-07 23:12:33,071] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-07 23:12:33,071] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_28/28/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-07 23:12:33,071] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 28 is ready now! [cnode7-012:0/16] 2024-12-07 23:12:33,108 (deepspeed_trainer:158) INFO: 28epoch results: [train] iter_time=0.134, loss_ctc=73.299, loss_att=56.231, acc=0.714, loss=61.351, grad_norm=5.285, loss_scale=1.000, learning_rate=7.628e-05, step_time=0.334, time=1 hour, 57 minutes and 24.9 seconds, total_count=420028, gpu_max_cached_mem_GB=37.916, [valid] loss_ctc=4.344, cer_ctc=0.142, loss_att=5.938, acc=0.908, cer=0.086, wer=0.625, loss=5.469, time=1 minute and 11.24 seconds, total_count=28, gpu_max_cached_mem_GB=37.916 [cnode7-012:0/16] 2024-12-07 23:12:34,323 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-07 23:13:01,233 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 23:13:18,439 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 23:13:18,439 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 23:13:18,442 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 23:13:38,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:13:38,333] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:13:39,016] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:13:39,326] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:13:39,229] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:13:39,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:13:40,472] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:13:41,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:13:40,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:13:40,816] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:13:41,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:13:41,132] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:13:42,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:13:42,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:13:42,886] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:13:44,136] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:25,635] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:26,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:26,486] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:26,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:26,843] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:26,395] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:28,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:28,828] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:29,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:28,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:29,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:29,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:29,543] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:30,762] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:32,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:14:31,948] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:13,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:14,356] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:13,845] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:13,957] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:14,253] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:14,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:17,922] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:17,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:18,329] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:17,794] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:18,917] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:18,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:19,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:20,032] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:20,005] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:15:20,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:01,095] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:01,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:01,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:01,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:01,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:05,675] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:07,155] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:07,201] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:06,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:07,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:07,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:07,675] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:07,138] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:08,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:08,786] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:16:09,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 23:17:11,338 (deepspeed_trainer:228) INFO: 29epoch:train:1-100batch: iter_time=2.413, loss_ctc=72.596, loss_att=54.909, acc=0.715, loss=60.214, grad_norm=5.124, loss_scale=1.000, learning_rate=7.559e-05, step_time=0.357 [cnode7-012:0/16] 2024-12-07 23:17:45,019 (deepspeed_trainer:228) INFO: 29epoch:train:101-200batch: iter_time=1.143e-04, loss_ctc=78.796, loss_att=61.662, acc=0.709, loss=66.809, grad_norm=5.335, loss_scale=1.000, learning_rate=7.558e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-07 23:18:18,311 (deepspeed_trainer:228) INFO: 29epoch:train:201-300batch: iter_time=1.134e-04, loss_ctc=71.455, loss_att=55.160, acc=0.723, loss=60.063, grad_norm=5.232, loss_scale=1.000, learning_rate=7.557e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 23:18:51,589 (deepspeed_trainer:228) INFO: 29epoch:train:301-400batch: iter_time=1.167e-04, loss_ctc=82.797, loss_att=62.889, acc=0.697, loss=68.869, grad_norm=6.927, loss_scale=1.000, learning_rate=7.556e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 23:19:24,951 (deepspeed_trainer:228) INFO: 29epoch:train:401-500batch: iter_time=1.149e-04, loss_ctc=82.418, loss_att=58.185, acc=0.704, loss=65.441, grad_norm=6.436, loss_scale=1.000, learning_rate=7.555e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 23:19:58,153 (deepspeed_trainer:228) INFO: 29epoch:train:501-600batch: iter_time=1.097e-04, loss_ctc=70.949, loss_att=54.780, acc=0.707, loss=59.605, grad_norm=5.652, loss_scale=1.000, learning_rate=7.554e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 23:20:31,406 (deepspeed_trainer:228) INFO: 29epoch:train:601-700batch: iter_time=1.164e-04, loss_ctc=70.964, loss_att=49.297, acc=0.712, loss=55.795, grad_norm=5.231, loss_scale=1.000, learning_rate=7.553e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 23:21:04,515 (deepspeed_trainer:228) INFO: 29epoch:train:701-800batch: iter_time=1.211e-04, loss_ctc=81.347, loss_att=62.194, acc=0.708, loss=67.964, grad_norm=5.425, loss_scale=1.000, learning_rate=7.553e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 23:21:37,477 (deepspeed_trainer:228) INFO: 29epoch:train:801-900batch: iter_time=1.139e-04, loss_ctc=76.016, loss_att=54.935, acc=0.705, loss=61.253, grad_norm=5.415, loss_scale=1.000, learning_rate=7.552e-05, step_time=0.329 [2024-12-07 23:22:10,405] [INFO] [logging.py:129:log_dist] [Rank 0] step=421000, skipped=0, lr=[np.float64(7.550297377071177e-05)], mom=[[0.9, 0.98]] [2024-12-07 23:22:10,406] [INFO] [timer.py:264:stop] epoch=0/micro_step=31000/global_step=31000, RunningAvgSamplesPerSec=49.31564762229555, CurrSamplesPerSec=51.20589152535308, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-07 23:22:10,410 (deepspeed_trainer:228) INFO: 29epoch:train:901-1000batch: iter_time=1.144e-04, loss_ctc=73.658, loss_att=54.588, acc=0.710, loss=60.297, grad_norm=5.184, loss_scale=1.000, learning_rate=7.551e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 23:22:43,520 (deepspeed_trainer:228) INFO: 29epoch:train:1001-1100batch: iter_time=1.112e-04, loss_ctc=74.290, loss_att=54.968, acc=0.709, loss=60.788, grad_norm=5.045, loss_scale=1.000, learning_rate=7.550e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 23:23:16,646 (deepspeed_trainer:228) INFO: 29epoch:train:1101-1200batch: iter_time=1.089e-04, loss_ctc=72.273, loss_att=51.182, acc=0.715, loss=57.496, grad_norm=5.291, loss_scale=1.000, learning_rate=7.549e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 23:23:49,953 (deepspeed_trainer:228) INFO: 29epoch:train:1201-1300batch: iter_time=1.170e-04, loss_ctc=64.941, loss_att=54.301, acc=0.717, loss=57.466, grad_norm=4.714, loss_scale=1.000, learning_rate=7.548e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 23:24:23,421 (deepspeed_trainer:228) INFO: 29epoch:train:1301-1400batch: iter_time=1.213e-04, loss_ctc=82.824, loss_att=57.604, acc=0.702, loss=65.156, grad_norm=5.821, loss_scale=1.000, learning_rate=7.547e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 23:24:56,721 (deepspeed_trainer:228) INFO: 29epoch:train:1401-1500batch: iter_time=1.145e-04, loss_ctc=75.031, loss_att=56.606, acc=0.704, loss=62.133, grad_norm=5.680, loss_scale=1.000, learning_rate=7.546e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 23:25:29,793 (deepspeed_trainer:228) INFO: 29epoch:train:1501-1600batch: iter_time=1.128e-04, loss_ctc=72.142, loss_att=50.535, acc=0.724, loss=57.033, grad_norm=5.570, loss_scale=1.000, learning_rate=7.545e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 23:26:03,104 (deepspeed_trainer:228) INFO: 29epoch:train:1601-1700batch: iter_time=1.138e-04, loss_ctc=75.897, loss_att=54.592, acc=0.718, loss=60.988, grad_norm=5.306, loss_scale=1.000, learning_rate=7.544e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 23:26:36,466 (deepspeed_trainer:228) INFO: 29epoch:train:1701-1800batch: iter_time=1.262e-04, loss_ctc=76.295, loss_att=59.162, acc=0.708, loss=64.300, grad_norm=6.296, loss_scale=1.000, learning_rate=7.544e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 23:27:06,369 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-07 23:27:32,483 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 23:27:49,175 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 23:27:49,176 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 23:27:49,178 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 23:28:15,992] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:28:16,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:28:16,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:28:16,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:28:16,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:28:16,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:28:17,136] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:28:17,382] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:28:18,399] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:28:18,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:28:19,345] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:28:19,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:28:19,201] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:28:19,888] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:28:19,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:28:20,369] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:04,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:06,276] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:07,101] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:07,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:07,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:08,116] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:08,395] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:08,992] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:09,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:09,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:09,369] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:09,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:10,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:10,569] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:10,656] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:12,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:54,146] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:54,567] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:56,329] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:56,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:56,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:56,654] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:57,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:57,262] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:59,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:59,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:59,213] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:59,749] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:00,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:59,799] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:29:59,882] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:03,827] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:42,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:43,693] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:44,626] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:44,971] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:45,853] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:46,568] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:46,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:46,596] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:48,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:49,141] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:49,480] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:49,512] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:50,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:50,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:51,055] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:30:55,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 23:31:31,031 (deepspeed_trainer:228) INFO: 29epoch:train:1801-1900batch: iter_time=2.524, loss_ctc=81.826, loss_att=63.111, acc=0.712, loss=68.731, grad_norm=6.583, loss_scale=1.000, learning_rate=7.543e-05, step_time=0.421 [2024-12-07 23:32:04,280] [INFO] [logging.py:129:log_dist] [Rank 0] step=422000, skipped=0, lr=[np.float64(7.541346242359447e-05)], mom=[[0.9, 0.98]] [2024-12-07 23:32:04,281] [INFO] [timer.py:264:stop] epoch=0/micro_step=32000/global_step=32000, RunningAvgSamplesPerSec=49.28427096788919, CurrSamplesPerSec=47.5969578504906, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-07 23:32:04,284 (deepspeed_trainer:228) INFO: 29epoch:train:1901-2000batch: iter_time=1.099e-04, loss_ctc=70.472, loss_att=53.257, acc=0.722, loss=58.419, grad_norm=4.945, loss_scale=1.000, learning_rate=7.542e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 23:32:37,412 (deepspeed_trainer:228) INFO: 29epoch:train:2001-2100batch: iter_time=1.102e-04, loss_ctc=74.972, loss_att=57.489, acc=0.720, loss=62.748, grad_norm=5.436, loss_scale=1.000, learning_rate=7.541e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 23:33:10,261 (deepspeed_trainer:228) INFO: 29epoch:train:2101-2200batch: iter_time=1.074e-04, loss_ctc=73.425, loss_att=58.646, acc=0.718, loss=63.081, grad_norm=4.965, loss_scale=1.000, learning_rate=7.540e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 23:33:43,269 (deepspeed_trainer:228) INFO: 29epoch:train:2201-2300batch: iter_time=1.070e-04, loss_ctc=83.884, loss_att=66.128, acc=0.701, loss=71.457, grad_norm=6.081, loss_scale=1.000, learning_rate=7.539e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 23:34:15,821 (deepspeed_trainer:228) INFO: 29epoch:train:2301-2400batch: iter_time=1.052e-04, loss_ctc=71.558, loss_att=48.129, acc=0.722, loss=55.161, grad_norm=6.069, loss_scale=1.000, learning_rate=7.538e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-07 23:34:48,594 (deepspeed_trainer:228) INFO: 29epoch:train:2401-2500batch: iter_time=1.061e-04, loss_ctc=71.923, loss_att=58.310, acc=0.697, loss=62.418, grad_norm=5.761, loss_scale=1.000, learning_rate=7.537e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-07 23:35:21,384 (deepspeed_trainer:228) INFO: 29epoch:train:2501-2600batch: iter_time=1.115e-04, loss_ctc=74.699, loss_att=54.861, acc=0.715, loss=60.808, grad_norm=5.428, loss_scale=1.000, learning_rate=7.536e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 23:35:54,286 (deepspeed_trainer:228) INFO: 29epoch:train:2601-2700batch: iter_time=1.074e-04, loss_ctc=75.416, loss_att=57.641, acc=0.723, loss=62.977, grad_norm=4.950, loss_scale=1.000, learning_rate=7.536e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 23:36:27,370 (deepspeed_trainer:228) INFO: 29epoch:train:2701-2800batch: iter_time=1.111e-04, loss_ctc=78.252, loss_att=56.722, acc=0.700, loss=63.184, grad_norm=5.789, loss_scale=1.000, learning_rate=7.535e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 23:37:00,232 (deepspeed_trainer:228) INFO: 29epoch:train:2801-2900batch: iter_time=1.079e-04, loss_ctc=66.176, loss_att=54.016, acc=0.717, loss=57.657, grad_norm=4.819, loss_scale=1.000, learning_rate=7.534e-05, step_time=0.328 [2024-12-07 23:37:33,266] [INFO] [logging.py:129:log_dist] [Rank 0] step=423000, skipped=0, lr=[np.float64(7.532426867974498e-05)], mom=[[0.9, 0.98]] [2024-12-07 23:37:33,266] [INFO] [timer.py:264:stop] epoch=0/micro_step=33000/global_step=33000, RunningAvgSamplesPerSec=49.31010179830478, CurrSamplesPerSec=47.77828583057959, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-07 23:37:33,269 (deepspeed_trainer:228) INFO: 29epoch:train:2901-3000batch: iter_time=1.066e-04, loss_ctc=73.641, loss_att=53.840, acc=0.720, loss=59.780, grad_norm=5.006, loss_scale=1.000, learning_rate=7.533e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 23:38:06,475 (deepspeed_trainer:228) INFO: 29epoch:train:3001-3100batch: iter_time=1.112e-04, loss_ctc=72.909, loss_att=53.438, acc=0.720, loss=59.279, grad_norm=4.908, loss_scale=1.000, learning_rate=7.532e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 23:38:39,877 (deepspeed_trainer:228) INFO: 29epoch:train:3101-3200batch: iter_time=1.133e-04, loss_ctc=69.304, loss_att=57.316, acc=0.721, loss=60.928, grad_norm=4.482, loss_scale=1.000, learning_rate=7.531e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-07 23:39:13,182 (deepspeed_trainer:228) INFO: 29epoch:train:3201-3300batch: iter_time=1.118e-04, loss_ctc=78.039, loss_att=58.126, acc=0.697, loss=64.088, grad_norm=6.320, loss_scale=1.000, learning_rate=7.530e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 23:39:46,427 (deepspeed_trainer:228) INFO: 29epoch:train:3301-3400batch: iter_time=1.114e-04, loss_ctc=71.842, loss_att=54.508, acc=0.714, loss=59.703, grad_norm=5.478, loss_scale=1.000, learning_rate=7.529e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 23:40:19,556 (deepspeed_trainer:228) INFO: 29epoch:train:3401-3500batch: iter_time=1.142e-04, loss_ctc=79.706, loss_att=56.508, acc=0.724, loss=63.478, grad_norm=5.709, loss_scale=1.000, learning_rate=7.528e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 23:40:52,500 (deepspeed_trainer:228) INFO: 29epoch:train:3501-3600batch: iter_time=1.148e-04, loss_ctc=71.841, loss_att=54.509, acc=0.726, loss=59.722, grad_norm=5.028, loss_scale=1.000, learning_rate=7.528e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 23:41:25,330 (deepspeed_trainer:228) INFO: 29epoch:train:3601-3700batch: iter_time=1.125e-04, loss_ctc=70.328, loss_att=55.117, acc=0.717, loss=59.688, grad_norm=5.419, loss_scale=1.000, learning_rate=7.527e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 23:41:46,552 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-07 23:42:12,904 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 23:42:29,269 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 23:42:29,269 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 23:42:29,272 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 23:42:53,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:42:53,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:42:54,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:42:55,060] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:42:55,075] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:42:55,645] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:42:55,810] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:42:57,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:42:57,661] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:42:57,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:42:57,761] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:42:58,163] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:42:58,566] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:42:58,230] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:42:58,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:00,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:41,835] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:42,147] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:42,731] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:43,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:43,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:43,665] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:45,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:45,731] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:46,233] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:47,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:47,583] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:47,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:47,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:47,742] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:48,999] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:43:51,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:28,664] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:29,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:30,262] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:30,943] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:30,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:33,023] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:35,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:35,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:35,816] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:35,717] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:36,149] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:36,401] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:36,077] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:36,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:37,283] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:44:43,345] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:16,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:17,550] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:17,214] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:17,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:18,810] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:21,871] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:23,804] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:24,037] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:24,557] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:24,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:25,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:25,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:25,866] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:25,957] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:26,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:45:33,757] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-07 23:46:16,028 (deepspeed_trainer:228) INFO: 29epoch:train:3701-3800batch: iter_time=2.504, loss_ctc=82.647, loss_att=63.752, acc=0.709, loss=69.384, grad_norm=5.755, loss_scale=1.000, learning_rate=7.526e-05, step_time=0.403 [cnode7-012:0/16] 2024-12-07 23:46:49,704 (deepspeed_trainer:228) INFO: 29epoch:train:3801-3900batch: iter_time=1.092e-04, loss_ctc=68.420, loss_att=54.473, acc=0.715, loss=58.662, grad_norm=5.059, loss_scale=1.000, learning_rate=7.525e-05, step_time=0.337 [2024-12-07 23:47:23,018] [INFO] [logging.py:129:log_dist] [Rank 0] step=424000, skipped=0, lr=[np.float64(7.523539066540063e-05)], mom=[[0.9, 0.98]] [2024-12-07 23:47:23,019] [INFO] [timer.py:264:stop] epoch=0/micro_step=34000/global_step=34000, RunningAvgSamplesPerSec=49.28806751260649, CurrSamplesPerSec=49.78276091479179, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-07 23:47:23,021 (deepspeed_trainer:228) INFO: 29epoch:train:3901-4000batch: iter_time=1.037e-04, loss_ctc=77.332, loss_att=56.649, acc=0.729, loss=62.852, grad_norm=4.943, loss_scale=1.000, learning_rate=7.524e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-07 23:47:56,221 (deepspeed_trainer:228) INFO: 29epoch:train:4001-4100batch: iter_time=1.058e-04, loss_ctc=74.678, loss_att=57.932, acc=0.707, loss=62.983, grad_norm=5.577, loss_scale=1.000, learning_rate=7.523e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 23:48:29,464 (deepspeed_trainer:228) INFO: 29epoch:train:4101-4200batch: iter_time=1.060e-04, loss_ctc=84.207, loss_att=65.279, acc=0.706, loss=70.936, grad_norm=6.591, loss_scale=1.000, learning_rate=7.522e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 23:49:02,348 (deepspeed_trainer:228) INFO: 29epoch:train:4201-4300batch: iter_time=1.072e-04, loss_ctc=65.468, loss_att=46.582, acc=0.710, loss=52.258, grad_norm=5.899, loss_scale=1.000, learning_rate=7.521e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-07 23:49:35,260 (deepspeed_trainer:228) INFO: 29epoch:train:4301-4400batch: iter_time=1.065e-04, loss_ctc=70.364, loss_att=53.597, acc=0.707, loss=58.610, grad_norm=5.257, loss_scale=1.000, learning_rate=7.520e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 23:50:08,437 (deepspeed_trainer:228) INFO: 29epoch:train:4401-4500batch: iter_time=1.064e-04, loss_ctc=77.484, loss_att=56.692, acc=0.721, loss=62.949, grad_norm=5.042, loss_scale=1.000, learning_rate=7.520e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 23:50:41,520 (deepspeed_trainer:228) INFO: 29epoch:train:4501-4600batch: iter_time=1.079e-04, loss_ctc=74.755, loss_att=59.670, acc=0.709, loss=64.186, grad_norm=5.122, loss_scale=1.000, learning_rate=7.519e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 23:51:14,464 (deepspeed_trainer:228) INFO: 29epoch:train:4601-4700batch: iter_time=1.092e-04, loss_ctc=73.259, loss_att=50.453, acc=0.712, loss=57.276, grad_norm=5.466, loss_scale=1.000, learning_rate=7.518e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 23:51:47,390 (deepspeed_trainer:228) INFO: 29epoch:train:4701-4800batch: iter_time=1.065e-04, loss_ctc=66.661, loss_att=52.965, acc=0.708, loss=57.067, grad_norm=5.047, loss_scale=1.000, learning_rate=7.517e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 23:52:20,430 (deepspeed_trainer:228) INFO: 29epoch:train:4801-4900batch: iter_time=1.058e-04, loss_ctc=72.831, loss_att=55.152, acc=0.716, loss=60.480, grad_norm=5.086, loss_scale=1.000, learning_rate=7.516e-05, step_time=0.330 [2024-12-07 23:52:53,342] [INFO] [logging.py:129:log_dist] [Rank 0] step=425000, skipped=0, lr=[np.float64(7.514682652223874e-05)], mom=[[0.9, 0.98]] [2024-12-07 23:52:53,342] [INFO] [timer.py:264:stop] epoch=0/micro_step=35000/global_step=35000, RunningAvgSamplesPerSec=49.30517258978175, CurrSamplesPerSec=49.37905706939874, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-07 23:52:53,345 (deepspeed_trainer:228) INFO: 29epoch:train:4901-5000batch: iter_time=1.057e-04, loss_ctc=74.012, loss_att=57.228, acc=0.707, loss=62.271, grad_norm=5.002, loss_scale=1.000, learning_rate=7.515e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 23:53:27,409 (deepspeed_trainer:228) INFO: 29epoch:train:5001-5100batch: iter_time=1.093e-04, loss_ctc=72.841, loss_att=52.920, acc=0.723, loss=58.891, grad_norm=5.084, loss_scale=1.000, learning_rate=7.514e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-07 23:54:00,336 (deepspeed_trainer:228) INFO: 29epoch:train:5101-5200batch: iter_time=1.079e-04, loss_ctc=70.526, loss_att=50.666, acc=0.701, loss=56.628, grad_norm=6.166, loss_scale=1.000, learning_rate=7.513e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 23:54:33,539 (deepspeed_trainer:228) INFO: 29epoch:train:5201-5300batch: iter_time=1.070e-04, loss_ctc=75.306, loss_att=57.558, acc=0.711, loss=62.876, grad_norm=5.153, loss_scale=1.000, learning_rate=7.512e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-07 23:55:06,588 (deepspeed_trainer:228) INFO: 29epoch:train:5301-5400batch: iter_time=1.074e-04, loss_ctc=78.116, loss_att=53.979, acc=0.726, loss=61.213, grad_norm=5.287, loss_scale=1.000, learning_rate=7.512e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-07 23:55:39,789 (deepspeed_trainer:228) INFO: 29epoch:train:5401-5500batch: iter_time=1.081e-04, loss_ctc=76.349, loss_att=61.036, acc=0.720, loss=65.610, grad_norm=5.642, loss_scale=1.000, learning_rate=7.511e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-07 23:56:12,660 (deepspeed_trainer:228) INFO: 29epoch:train:5501-5600batch: iter_time=1.071e-04, loss_ctc=70.154, loss_att=52.661, acc=0.713, loss=57.917, grad_norm=5.525, loss_scale=1.000, learning_rate=7.510e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-07 23:56:25,575 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-07 23:56:51,448 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-07 23:57:08,327 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-07 23:57:08,327 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-07 23:57:08,329 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-07 23:57:33,694] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:57:34,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:57:33,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:57:34,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:57:34,329] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:57:35,348] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:57:37,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:57:37,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:57:37,449] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:57:38,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:57:38,164] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:57:38,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:57:38,251] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:57:38,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:57:39,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:57:38,851] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:21,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:22,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:23,645] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:23,410] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:24,028] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:24,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:27,729] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:27,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:28,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:28,286] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:29,222] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:29,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:29,534] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:29,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:29,992] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:58:31,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:10,371] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:10,601] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:12,115] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:12,302] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:13,087] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:16,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:16,950] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:17,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:18,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:18,603] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:19,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:18,844] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:19,049] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:19,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:19,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-07 23:59:23,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:00,475] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:01,024] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:01,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:02,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:02,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:07,979] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:08,528] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:08,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:09,214] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:09,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:09,811] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:09,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:10,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:10,515] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:10,748] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:00:26,047] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 00:01:15,777 (deepspeed_trainer:228) INFO: 29epoch:train:5601-5700batch: iter_time=2.542, loss_ctc=70.172, loss_att=54.210, acc=0.720, loss=58.990, grad_norm=4.961, loss_scale=1.000, learning_rate=7.509e-05, step_time=0.489 [cnode7-012:0/16] 2024-12-08 00:01:49,611 (deepspeed_trainer:228) INFO: 29epoch:train:5701-5800batch: iter_time=1.059e-04, loss_ctc=77.251, loss_att=63.444, acc=0.715, loss=67.612, grad_norm=5.511, loss_scale=1.000, learning_rate=7.508e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 00:02:22,866 (deepspeed_trainer:228) INFO: 29epoch:train:5801-5900batch: iter_time=1.091e-04, loss_ctc=72.562, loss_att=53.349, acc=0.728, loss=59.107, grad_norm=5.016, loss_scale=1.000, learning_rate=7.507e-05, step_time=0.332 [2024-12-08 00:02:56,001] [INFO] [logging.py:129:log_dist] [Rank 0] step=426000, skipped=0, lr=[np.float64(7.505857440721354e-05)], mom=[[0.9, 0.98]] [2024-12-08 00:02:56,002] [INFO] [timer.py:264:stop] epoch=0/micro_step=36000/global_step=36000, RunningAvgSamplesPerSec=49.24755525595704, CurrSamplesPerSec=52.39260840162468, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-08 00:02:56,005 (deepspeed_trainer:228) INFO: 29epoch:train:5901-6000batch: iter_time=1.044e-04, loss_ctc=77.796, loss_att=64.630, acc=0.705, loss=68.581, grad_norm=6.461, loss_scale=1.000, learning_rate=7.506e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 00:03:28,928 (deepspeed_trainer:228) INFO: 29epoch:train:6001-6100batch: iter_time=1.077e-04, loss_ctc=77.678, loss_att=56.624, acc=0.716, loss=62.913, grad_norm=5.596, loss_scale=1.000, learning_rate=7.505e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 00:04:01,891 (deepspeed_trainer:228) INFO: 29epoch:train:6101-6200batch: iter_time=1.083e-04, loss_ctc=65.643, loss_att=48.053, acc=0.719, loss=53.317, grad_norm=6.091, loss_scale=1.000, learning_rate=7.505e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 00:04:35,139 (deepspeed_trainer:228) INFO: 29epoch:train:6201-6300batch: iter_time=1.093e-04, loss_ctc=72.654, loss_att=53.948, acc=0.706, loss=59.547, grad_norm=5.299, loss_scale=1.000, learning_rate=7.504e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 00:05:08,687 (deepspeed_trainer:228) INFO: 29epoch:train:6301-6400batch: iter_time=1.121e-04, loss_ctc=82.275, loss_att=64.062, acc=0.719, loss=69.503, grad_norm=5.557, loss_scale=1.000, learning_rate=7.503e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 00:05:41,890 (deepspeed_trainer:228) INFO: 29epoch:train:6401-6500batch: iter_time=1.073e-04, loss_ctc=68.524, loss_att=52.419, acc=0.718, loss=57.256, grad_norm=4.821, loss_scale=1.000, learning_rate=7.502e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 00:06:15,022 (deepspeed_trainer:228) INFO: 29epoch:train:6501-6600batch: iter_time=1.083e-04, loss_ctc=68.916, loss_att=47.623, acc=0.722, loss=54.026, grad_norm=5.231, loss_scale=1.000, learning_rate=7.501e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 00:06:48,108 (deepspeed_trainer:228) INFO: 29epoch:train:6601-6700batch: iter_time=1.054e-04, loss_ctc=74.532, loss_att=58.741, acc=0.709, loss=63.476, grad_norm=5.089, loss_scale=1.000, learning_rate=7.500e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 00:07:21,335 (deepspeed_trainer:228) INFO: 29epoch:train:6701-6800batch: iter_time=1.094e-04, loss_ctc=74.213, loss_att=55.580, acc=0.724, loss=61.167, grad_norm=5.181, loss_scale=1.000, learning_rate=7.499e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 00:07:54,629 (deepspeed_trainer:228) INFO: 29epoch:train:6801-6900batch: iter_time=1.086e-04, loss_ctc=68.509, loss_att=54.599, acc=0.725, loss=58.757, grad_norm=4.962, loss_scale=1.000, learning_rate=7.498e-05, step_time=0.333 [2024-12-08 00:08:27,693] [INFO] [logging.py:129:log_dist] [Rank 0] step=427000, skipped=0, lr=[np.float64(7.497063249239499e-05)], mom=[[0.9, 0.98]] [2024-12-08 00:08:27,694] [INFO] [timer.py:264:stop] epoch=0/micro_step=37000/global_step=37000, RunningAvgSamplesPerSec=49.25925510283056, CurrSamplesPerSec=52.56756217069311, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-08 00:08:27,697 (deepspeed_trainer:228) INFO: 29epoch:train:6901-7000batch: iter_time=1.059e-04, loss_ctc=74.007, loss_att=55.456, acc=0.716, loss=61.021, grad_norm=5.634, loss_scale=1.000, learning_rate=7.497e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 00:09:01,232 (deepspeed_trainer:228) INFO: 29epoch:train:7001-7100batch: iter_time=1.086e-04, loss_ctc=73.324, loss_att=57.240, acc=0.701, loss=62.056, grad_norm=5.521, loss_scale=1.000, learning_rate=7.497e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 00:09:34,839 (deepspeed_trainer:228) INFO: 29epoch:train:7101-7200batch: iter_time=1.089e-04, loss_ctc=70.038, loss_att=49.920, acc=0.732, loss=55.952, grad_norm=4.603, loss_scale=1.000, learning_rate=7.496e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 00:10:07,988 (deepspeed_trainer:228) INFO: 29epoch:train:7201-7300batch: iter_time=1.077e-04, loss_ctc=81.512, loss_att=58.636, acc=0.725, loss=65.507, grad_norm=5.389, loss_scale=1.000, learning_rate=7.495e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 00:10:41,060 (deepspeed_trainer:228) INFO: 29epoch:train:7301-7400batch: iter_time=1.085e-04, loss_ctc=73.344, loss_att=58.823, acc=0.721, loss=63.179, grad_norm=5.092, loss_scale=1.000, learning_rate=7.494e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 00:11:14,125 (deepspeed_trainer:228) INFO: 29epoch:train:7401-7500batch: iter_time=1.103e-04, loss_ctc=71.001, loss_att=53.999, acc=0.718, loss=59.119, grad_norm=5.254, loss_scale=1.000, learning_rate=7.493e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 00:11:18,542 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-08 00:11:45,687 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 00:12:02,259 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 00:12:02,259 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 00:12:02,261 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 00:12:26,806] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:12:26,843] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:12:26,326] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:12:26,363] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:12:27,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:12:26,956] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:12:28,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:12:29,915] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:12:30,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:12:30,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:12:30,150] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:12:30,248] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:12:30,808] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:12:31,181] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:12:30,979] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:12:33,350] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:14,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:14,748] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:14,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:15,018] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:14,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:14,846] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:18,195] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:18,212] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:18,695] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:19,276] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:19,359] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:20,264] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:20,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:20,946] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:21,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:13:21,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:01,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:02,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:01,721] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:02,081] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:02,410] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:03,060] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:06,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:06,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:07,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:07,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:07,412] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:08,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:09,873] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:09,955] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:10,221] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:10,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:48,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:50,444] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:50,489] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:51,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:52,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:53,275] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:55,258] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:55,376] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:56,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:57,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:57,137] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:58,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:59,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:58,781] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:14:59,723] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:15:03,205] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 00:16:03,258 (deepspeed_trainer:228) INFO: 29epoch:train:7501-7600batch: iter_time=2.554, loss_ctc=71.925, loss_att=55.214, acc=0.716, loss=60.231, grad_norm=5.213, loss_scale=1.000, learning_rate=7.492e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 00:16:36,900 (deepspeed_trainer:228) INFO: 29epoch:train:7601-7700batch: iter_time=1.065e-04, loss_ctc=76.831, loss_att=62.012, acc=0.711, loss=66.472, grad_norm=5.194, loss_scale=1.000, learning_rate=7.491e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 00:17:10,677 (deepspeed_trainer:228) INFO: 29epoch:train:7701-7800batch: iter_time=1.094e-04, loss_ctc=70.239, loss_att=54.514, acc=0.728, loss=59.231, grad_norm=4.845, loss_scale=1.000, learning_rate=7.490e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 00:17:44,025 (deepspeed_trainer:228) INFO: 29epoch:train:7801-7900batch: iter_time=1.084e-04, loss_ctc=79.523, loss_att=60.346, acc=0.705, loss=66.115, grad_norm=6.180, loss_scale=1.000, learning_rate=7.490e-05, step_time=0.333 [2024-12-08 00:18:17,209] [INFO] [logging.py:129:log_dist] [Rank 0] step=428000, skipped=0, lr=[np.float64(7.48829989648098e-05)], mom=[[0.9, 0.98]] [2024-12-08 00:18:17,210] [INFO] [timer.py:264:stop] epoch=0/micro_step=38000/global_step=38000, RunningAvgSamplesPerSec=49.26117763329684, CurrSamplesPerSec=51.697634128372634, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-08 00:18:17,212 (deepspeed_trainer:228) INFO: 29epoch:train:7901-8000batch: iter_time=1.081e-04, loss_ctc=78.073, loss_att=56.374, acc=0.712, loss=62.890, grad_norm=6.518, loss_scale=1.000, learning_rate=7.489e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 00:18:50,202 (deepspeed_trainer:228) INFO: 29epoch:train:8001-8100batch: iter_time=1.078e-04, loss_ctc=67.018, loss_att=53.515, acc=0.709, loss=57.585, grad_norm=5.653, loss_scale=1.000, learning_rate=7.488e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 00:19:23,018 (deepspeed_trainer:228) INFO: 29epoch:train:8101-8200batch: iter_time=1.093e-04, loss_ctc=68.377, loss_att=48.790, acc=0.718, loss=54.668, grad_norm=4.815, loss_scale=1.000, learning_rate=7.487e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 00:19:56,091 (deepspeed_trainer:228) INFO: 29epoch:train:8201-8300batch: iter_time=1.082e-04, loss_ctc=79.477, loss_att=61.796, acc=0.712, loss=67.091, grad_norm=5.519, loss_scale=1.000, learning_rate=7.486e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 00:20:28,908 (deepspeed_trainer:228) INFO: 29epoch:train:8301-8400batch: iter_time=1.089e-04, loss_ctc=71.948, loss_att=53.830, acc=0.708, loss=59.289, grad_norm=5.521, loss_scale=1.000, learning_rate=7.485e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 00:21:01,862 (deepspeed_trainer:228) INFO: 29epoch:train:8401-8500batch: iter_time=1.101e-04, loss_ctc=71.602, loss_att=53.916, acc=0.713, loss=59.238, grad_norm=5.702, loss_scale=1.000, learning_rate=7.484e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 00:21:34,678 (deepspeed_trainer:228) INFO: 29epoch:train:8501-8600batch: iter_time=1.081e-04, loss_ctc=72.723, loss_att=53.647, acc=0.715, loss=59.356, grad_norm=4.980, loss_scale=1.000, learning_rate=7.483e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 00:22:07,691 (deepspeed_trainer:228) INFO: 29epoch:train:8601-8700batch: iter_time=1.070e-04, loss_ctc=71.586, loss_att=51.664, acc=0.715, loss=57.660, grad_norm=4.854, loss_scale=1.000, learning_rate=7.483e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 00:22:40,608 (deepspeed_trainer:228) INFO: 29epoch:train:8701-8800batch: iter_time=1.069e-04, loss_ctc=64.059, loss_att=53.395, acc=0.720, loss=56.603, grad_norm=4.515, loss_scale=1.000, learning_rate=7.482e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 00:23:13,569 (deepspeed_trainer:228) INFO: 29epoch:train:8801-8900batch: iter_time=1.064e-04, loss_ctc=78.813, loss_att=56.774, acc=0.707, loss=63.351, grad_norm=5.504, loss_scale=1.000, learning_rate=7.481e-05, step_time=0.329 [2024-12-08 00:23:46,580] [INFO] [logging.py:129:log_dist] [Rank 0] step=429000, skipped=0, lr=[np.float64(7.479567202628445e-05)], mom=[[0.9, 0.98]] [2024-12-08 00:23:46,581] [INFO] [timer.py:264:stop] epoch=0/micro_step=39000/global_step=39000, RunningAvgSamplesPerSec=49.281642040398445, CurrSamplesPerSec=48.99651044837358, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-08 00:23:46,584 (deepspeed_trainer:228) INFO: 29epoch:train:8901-9000batch: iter_time=1.066e-04, loss_ctc=73.166, loss_att=55.875, acc=0.707, loss=61.062, grad_norm=5.307, loss_scale=1.000, learning_rate=7.480e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 00:24:19,422 (deepspeed_trainer:228) INFO: 29epoch:train:9001-9100batch: iter_time=1.073e-04, loss_ctc=70.666, loss_att=49.774, acc=0.727, loss=56.057, grad_norm=4.873, loss_scale=1.000, learning_rate=7.479e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 00:24:52,439 (deepspeed_trainer:228) INFO: 29epoch:train:9101-9200batch: iter_time=1.070e-04, loss_ctc=74.714, loss_att=53.613, acc=0.723, loss=59.944, grad_norm=4.977, loss_scale=1.000, learning_rate=7.478e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 00:25:25,516 (deepspeed_trainer:228) INFO: 29epoch:train:9201-9300batch: iter_time=1.076e-04, loss_ctc=74.454, loss_att=58.753, acc=0.711, loss=63.459, grad_norm=5.269, loss_scale=1.000, learning_rate=7.477e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 00:25:55,190 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-08 00:26:22,267 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 00:26:39,424 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 00:26:39,425 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 00:26:39,427 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 00:27:02,253] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:02,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:03,551] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:03,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:04,170] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:04,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:04,911] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:05,452] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:05,562] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:06,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:06,422] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:07,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:08,159] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:08,216] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:08,748] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:08,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:49,392] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:50,109] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:51,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:51,572] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:52,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:54,944] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:55,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:55,174] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:55,216] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:55,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:55,486] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:57,626] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:57,743] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:58,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:58,276] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:27:58,953] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:34,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:36,163] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:38,044] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:39,174] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:42,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:43,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:44,454] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:44,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:44,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:44,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:45,594] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:47,115] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:47,164] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:47,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:47,765] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:28:48,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:21,020] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:22,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:24,142] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:26,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:31,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:32,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:32,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:32,617] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:33,116] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:33,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:34,953] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:35,384] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:35,493] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:35,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:36,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:29:39,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 00:30:15,226 (deepspeed_trainer:228) INFO: 29epoch:train:9301-9400batch: iter_time=2.539, loss_ctc=78.131, loss_att=61.886, acc=0.716, loss=66.751, grad_norm=5.955, loss_scale=1.000, learning_rate=7.477e-05, step_time=0.360 [cnode7-012:0/16] 2024-12-08 00:30:49,492 (deepspeed_trainer:228) INFO: 29epoch:train:9401-9500batch: iter_time=1.050e-04, loss_ctc=69.852, loss_att=51.794, acc=0.726, loss=57.200, grad_norm=4.924, loss_scale=1.000, learning_rate=7.476e-05, step_time=0.342 [cnode7-012:0/16] 2024-12-08 00:31:23,629 (deepspeed_trainer:228) INFO: 29epoch:train:9501-9600batch: iter_time=1.113e-04, loss_ctc=73.855, loss_att=56.205, acc=0.725, loss=61.497, grad_norm=4.955, loss_scale=1.000, learning_rate=7.475e-05, step_time=0.341 [cnode7-012:0/16] 2024-12-08 00:31:57,756 (deepspeed_trainer:228) INFO: 29epoch:train:9601-9700batch: iter_time=1.096e-04, loss_ctc=72.346, loss_att=58.126, acc=0.718, loss=62.413, grad_norm=5.466, loss_scale=1.000, learning_rate=7.474e-05, step_time=0.341 [cnode7-012:0/16] 2024-12-08 00:32:32,031 (deepspeed_trainer:228) INFO: 29epoch:train:9701-9800batch: iter_time=1.155e-04, loss_ctc=82.611, loss_att=64.940, acc=0.708, loss=70.265, grad_norm=5.990, loss_scale=1.000, learning_rate=7.473e-05, step_time=0.342 [cnode7-012:0/16] 2024-12-08 00:33:05,429 (deepspeed_trainer:228) INFO: 29epoch:train:9801-9900batch: iter_time=1.151e-04, loss_ctc=69.089, loss_att=47.975, acc=0.726, loss=54.314, grad_norm=5.863, loss_scale=1.000, learning_rate=7.472e-05, step_time=0.334 [2024-12-08 00:33:38,947] [INFO] [logging.py:129:log_dist] [Rank 0] step=430000, skipped=0, lr=[np.float64(7.470864989329022e-05)], mom=[[0.9, 0.98]] [2024-12-08 00:33:38,948] [INFO] [timer.py:264:stop] epoch=0/micro_step=40000/global_step=40000, RunningAvgSamplesPerSec=49.26591502600129, CurrSamplesPerSec=50.830645034858975, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-08 00:33:38,951 (deepspeed_trainer:228) INFO: 29epoch:train:9901-10000batch: iter_time=1.180e-04, loss_ctc=69.559, loss_att=56.813, acc=0.705, loss=60.636, grad_norm=5.357, loss_scale=1.000, learning_rate=7.471e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 00:34:13,186 (deepspeed_trainer:228) INFO: 29epoch:train:10001-10100batch: iter_time=1.107e-04, loss_ctc=73.714, loss_att=54.307, acc=0.719, loss=60.148, grad_norm=4.874, loss_scale=1.000, learning_rate=7.470e-05, step_time=0.342 [cnode7-012:0/16] 2024-12-08 00:34:46,794 (deepspeed_trainer:228) INFO: 29epoch:train:10101-10200batch: iter_time=1.098e-04, loss_ctc=74.486, loss_att=56.992, acc=0.725, loss=62.242, grad_norm=5.154, loss_scale=1.000, learning_rate=7.470e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 00:35:20,156 (deepspeed_trainer:228) INFO: 29epoch:train:10201-10300batch: iter_time=1.113e-04, loss_ctc=75.221, loss_att=54.699, acc=0.707, loss=60.869, grad_norm=5.660, loss_scale=1.000, learning_rate=7.469e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 00:35:53,457 (deepspeed_trainer:228) INFO: 29epoch:train:10301-10400batch: iter_time=1.080e-04, loss_ctc=65.481, loss_att=52.832, acc=0.721, loss=56.640, grad_norm=4.801, loss_scale=1.000, learning_rate=7.468e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 00:36:26,805 (deepspeed_trainer:228) INFO: 29epoch:train:10401-10500batch: iter_time=1.125e-04, loss_ctc=73.007, loss_att=53.379, acc=0.723, loss=59.264, grad_norm=5.204, loss_scale=1.000, learning_rate=7.467e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 00:37:00,228 (deepspeed_trainer:228) INFO: 29epoch:train:10501-10600batch: iter_time=1.095e-04, loss_ctc=72.349, loss_att=52.335, acc=0.726, loss=58.329, grad_norm=4.634, loss_scale=1.000, learning_rate=7.466e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 00:37:33,801 (deepspeed_trainer:228) INFO: 29epoch:train:10601-10700batch: iter_time=1.143e-04, loss_ctc=68.862, loss_att=56.681, acc=0.723, loss=60.337, grad_norm=4.428, loss_scale=1.000, learning_rate=7.465e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 00:38:07,360 (deepspeed_trainer:228) INFO: 29epoch:train:10701-10800batch: iter_time=1.082e-04, loss_ctc=75.964, loss_att=57.615, acc=0.704, loss=63.118, grad_norm=5.862, loss_scale=1.000, learning_rate=7.464e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 00:38:41,044 (deepspeed_trainer:228) INFO: 29epoch:train:10801-10900batch: iter_time=1.123e-04, loss_ctc=71.372, loss_att=54.098, acc=0.719, loss=59.268, grad_norm=5.035, loss_scale=1.000, learning_rate=7.463e-05, step_time=0.336 [2024-12-08 00:39:14,541] [INFO] [logging.py:129:log_dist] [Rank 0] step=431000, skipped=0, lr=[np.float64(7.462193079679015e-05)], mom=[[0.9, 0.98]] [2024-12-08 00:39:14,541] [INFO] [timer.py:264:stop] epoch=0/micro_step=41000/global_step=41000, RunningAvgSamplesPerSec=49.26447996354655, CurrSamplesPerSec=47.749321901179606, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-08 00:39:14,544 (deepspeed_trainer:228) INFO: 29epoch:train:10901-11000batch: iter_time=1.104e-04, loss_ctc=78.931, loss_att=55.380, acc=0.729, loss=62.453, grad_norm=5.308, loss_scale=1.000, learning_rate=7.463e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 00:39:47,734 (deepspeed_trainer:228) INFO: 29epoch:train:11001-11100batch: iter_time=1.096e-04, loss_ctc=70.804, loss_att=53.667, acc=0.731, loss=58.820, grad_norm=5.254, loss_scale=1.000, learning_rate=7.462e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 00:40:21,036 (deepspeed_trainer:228) INFO: 29epoch:train:11101-11200batch: iter_time=1.174e-04, loss_ctc=69.464, loss_att=53.771, acc=0.722, loss=58.488, grad_norm=5.620, loss_scale=1.000, learning_rate=7.461e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 00:40:42,248 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-08 00:41:08,710 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 00:41:24,846 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 00:41:24,846 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 00:41:24,848 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 00:41:49,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:41:51,152] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:41:51,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:41:51,528] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:41:51,582] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:41:51,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:41:51,897] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:41:52,543] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:41:52,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:41:54,048] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:41:54,218] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:41:54,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:41:54,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:41:55,337] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:41:56,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:41:56,348] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:38,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:38,359] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:39,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:39,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:39,916] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:40,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:41,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:41,665] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:42,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:42,302] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:43,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:44,109] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:44,348] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:44,687] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:44,986] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:42:45,767] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:25,229] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:27,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:27,221] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:27,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:27,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:28,963] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:30,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:30,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:30,745] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:31,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:32,961] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:33,706] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:33,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:34,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:34,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:43:35,270] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:12,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:14,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:14,590] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:15,066] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:15,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:17,311] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:18,841] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:18,889] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:19,303] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:19,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:20,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:23,888] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:23,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:24,366] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:24,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:44:25,082] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 00:45:08,639 (deepspeed_trainer:228) INFO: 29epoch:train:11201-11300batch: iter_time=2.527, loss_ctc=79.929, loss_att=62.708, acc=0.713, loss=67.893, grad_norm=5.303, loss_scale=1.000, learning_rate=7.460e-05, step_time=0.348 [cnode7-012:0/16] 2024-12-08 00:45:42,393 (deepspeed_trainer:228) INFO: 29epoch:train:11301-11400batch: iter_time=1.164e-04, loss_ctc=66.985, loss_att=53.854, acc=0.719, loss=57.785, grad_norm=4.987, loss_scale=1.000, learning_rate=7.459e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 00:46:16,180 (deepspeed_trainer:228) INFO: 29epoch:train:11401-11500batch: iter_time=1.114e-04, loss_ctc=76.469, loss_att=55.932, acc=0.731, loss=62.092, grad_norm=5.106, loss_scale=1.000, learning_rate=7.458e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 00:46:49,511 (deepspeed_trainer:228) INFO: 29epoch:train:11501-11600batch: iter_time=1.105e-04, loss_ctc=73.946, loss_att=57.028, acc=0.711, loss=62.095, grad_norm=5.740, loss_scale=1.000, learning_rate=7.457e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 00:47:22,959 (deepspeed_trainer:228) INFO: 29epoch:train:11601-11700batch: iter_time=1.121e-04, loss_ctc=82.340, loss_att=65.755, acc=0.707, loss=70.729, grad_norm=6.163, loss_scale=1.000, learning_rate=7.457e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 00:47:55,981 (deepspeed_trainer:228) INFO: 29epoch:train:11701-11800batch: iter_time=1.138e-04, loss_ctc=64.359, loss_att=45.564, acc=0.715, loss=51.194, grad_norm=5.359, loss_scale=1.000, learning_rate=7.456e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 00:48:29,318 (deepspeed_trainer:228) INFO: 29epoch:train:11801-11900batch: iter_time=1.091e-04, loss_ctc=68.864, loss_att=52.695, acc=0.712, loss=57.545, grad_norm=5.247, loss_scale=1.000, learning_rate=7.455e-05, step_time=0.333 [2024-12-08 00:49:02,808] [INFO] [logging.py:129:log_dist] [Rank 0] step=432000, skipped=0, lr=[np.float64(7.453551298208807e-05)], mom=[[0.9, 0.98]] [2024-12-08 00:49:02,808] [INFO] [timer.py:264:stop] epoch=0/micro_step=42000/global_step=42000, RunningAvgSamplesPerSec=49.26396663423569, CurrSamplesPerSec=47.01089714910201, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-08 00:49:02,811 (deepspeed_trainer:228) INFO: 29epoch:train:11901-12000batch: iter_time=1.109e-04, loss_ctc=77.274, loss_att=56.481, acc=0.723, loss=62.730, grad_norm=4.942, loss_scale=1.000, learning_rate=7.454e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 00:49:36,270 (deepspeed_trainer:228) INFO: 29epoch:train:12001-12100batch: iter_time=1.129e-04, loss_ctc=74.362, loss_att=58.404, acc=0.712, loss=63.239, grad_norm=5.131, loss_scale=1.000, learning_rate=7.453e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 00:50:09,627 (deepspeed_trainer:228) INFO: 29epoch:train:12101-12200batch: iter_time=1.115e-04, loss_ctc=72.628, loss_att=49.814, acc=0.714, loss=56.660, grad_norm=5.246, loss_scale=1.000, learning_rate=7.452e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 00:50:42,964 (deepspeed_trainer:228) INFO: 29epoch:train:12201-12300batch: iter_time=1.100e-04, loss_ctc=66.492, loss_att=53.516, acc=0.709, loss=57.398, grad_norm=4.972, loss_scale=1.000, learning_rate=7.451e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 00:51:16,411 (deepspeed_trainer:228) INFO: 29epoch:train:12301-12400batch: iter_time=1.129e-04, loss_ctc=71.832, loss_att=53.574, acc=0.722, loss=59.037, grad_norm=4.857, loss_scale=1.000, learning_rate=7.451e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 00:51:49,458 (deepspeed_trainer:228) INFO: 29epoch:train:12401-12500batch: iter_time=1.105e-04, loss_ctc=73.494, loss_att=56.886, acc=0.711, loss=61.873, grad_norm=4.913, loss_scale=1.000, learning_rate=7.450e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 00:52:22,360 (deepspeed_trainer:228) INFO: 29epoch:train:12501-12600batch: iter_time=1.074e-04, loss_ctc=72.062, loss_att=52.663, acc=0.725, loss=58.481, grad_norm=4.841, loss_scale=1.000, learning_rate=7.449e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 00:52:55,248 (deepspeed_trainer:228) INFO: 29epoch:train:12601-12700batch: iter_time=1.091e-04, loss_ctc=69.201, loss_att=49.778, acc=0.706, loss=55.627, grad_norm=5.769, loss_scale=1.000, learning_rate=7.448e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 00:53:28,536 (deepspeed_trainer:228) INFO: 29epoch:train:12701-12800batch: iter_time=1.115e-04, loss_ctc=74.324, loss_att=56.023, acc=0.717, loss=61.531, grad_norm=5.576, loss_scale=1.000, learning_rate=7.447e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 00:54:02,460 (deepspeed_trainer:228) INFO: 29epoch:train:12801-12900batch: iter_time=1.088e-04, loss_ctc=76.964, loss_att=52.793, acc=0.729, loss=60.048, grad_norm=5.144, loss_scale=1.000, learning_rate=7.446e-05, step_time=0.339 [2024-12-08 00:54:35,630] [INFO] [logging.py:129:log_dist] [Rank 0] step=433000, skipped=0, lr=[np.float64(7.444939470867946e-05)], mom=[[0.9, 0.98]] [2024-12-08 00:54:35,630] [INFO] [timer.py:264:stop] epoch=0/micro_step=43000/global_step=43000, RunningAvgSamplesPerSec=49.272900598691834, CurrSamplesPerSec=46.98854699910753, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-08 00:54:35,633 (deepspeed_trainer:228) INFO: 29epoch:train:12901-13000batch: iter_time=1.120e-04, loss_ctc=74.667, loss_att=59.164, acc=0.724, loss=63.812, grad_norm=5.468, loss_scale=1.000, learning_rate=7.445e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 00:55:08,692 (deepspeed_trainer:228) INFO: 29epoch:train:13001-13100batch: iter_time=1.111e-04, loss_ctc=68.643, loss_att=51.830, acc=0.715, loss=56.848, grad_norm=5.124, loss_scale=1.000, learning_rate=7.445e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 00:55:21,170 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-08 00:55:47,690 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 00:56:05,333 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 00:56:05,333 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 00:56:05,335 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 00:56:28,898] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:56:29,907] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:56:30,676] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:56:30,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:56:30,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:56:31,167] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:56:31,444] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:56:31,947] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:56:32,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:56:32,484] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:56:32,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:56:33,712] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:56:34,095] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:56:34,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:56:34,738] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:56:34,790] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:16,493] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:17,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:17,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:18,703] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:19,193] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:20,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:20,432] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:21,071] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:21,195] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:21,553] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:22,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:23,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:23,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:24,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:24,502] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:57:24,608] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:02,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:03,459] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:04,488] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:06,866] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:08,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:08,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:09,496] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:09,522] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:09,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:11,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:11,629] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:12,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:12,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:13,027] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:13,959] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:14,263] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:49,320] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:50,313] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:51,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:55,181] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:57,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:57,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:58,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:58,967] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:59,423] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:58:59,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:59:00,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:59:01,179] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:59:01,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:59:02,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:59:02,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 00:59:02,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 00:59:55,515 (deepspeed_trainer:228) INFO: 29epoch:train:13101-13200batch: iter_time=2.521, loss_ctc=69.899, loss_att=53.940, acc=0.723, loss=58.720, grad_norm=5.100, loss_scale=1.000, learning_rate=7.444e-05, step_time=0.348 [cnode7-012:0/16] 2024-12-08 01:00:29,412 (deepspeed_trainer:228) INFO: 29epoch:train:13201-13300batch: iter_time=1.120e-04, loss_ctc=76.212, loss_att=63.343, acc=0.719, loss=67.192, grad_norm=5.090, loss_scale=1.000, learning_rate=7.443e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 01:01:02,914 (deepspeed_trainer:228) INFO: 29epoch:train:13301-13400batch: iter_time=1.085e-04, loss_ctc=72.450, loss_att=52.608, acc=0.730, loss=58.570, grad_norm=5.068, loss_scale=1.000, learning_rate=7.442e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 01:01:36,231 (deepspeed_trainer:228) INFO: 29epoch:train:13401-13500batch: iter_time=1.161e-04, loss_ctc=77.288, loss_att=64.446, acc=0.704, loss=68.265, grad_norm=6.858, loss_scale=1.000, learning_rate=7.441e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 01:02:10,010 (deepspeed_trainer:228) INFO: 29epoch:train:13501-13600batch: iter_time=1.088e-04, loss_ctc=76.669, loss_att=56.436, acc=0.720, loss=62.537, grad_norm=6.209, loss_scale=1.000, learning_rate=7.440e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 01:02:43,189 (deepspeed_trainer:228) INFO: 29epoch:train:13601-13700batch: iter_time=1.112e-04, loss_ctc=64.616, loss_att=46.499, acc=0.726, loss=51.940, grad_norm=5.428, loss_scale=1.000, learning_rate=7.439e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 01:03:16,307 (deepspeed_trainer:228) INFO: 29epoch:train:13701-13800batch: iter_time=1.106e-04, loss_ctc=71.176, loss_att=53.589, acc=0.711, loss=58.861, grad_norm=5.133, loss_scale=1.000, learning_rate=7.438e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 01:03:49,571 (deepspeed_trainer:228) INFO: 29epoch:train:13801-13900batch: iter_time=1.108e-04, loss_ctc=80.772, loss_att=62.962, acc=0.722, loss=68.285, grad_norm=5.382, loss_scale=1.000, learning_rate=7.438e-05, step_time=0.332 [2024-12-08 01:04:22,403] [INFO] [logging.py:129:log_dist] [Rank 0] step=434000, skipped=0, lr=[np.float64(7.436357425010413e-05)], mom=[[0.9, 0.98]] [2024-12-08 01:04:22,404] [INFO] [timer.py:264:stop] epoch=0/micro_step=44000/global_step=44000, RunningAvgSamplesPerSec=49.273599262794754, CurrSamplesPerSec=52.03922967951594, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-08 01:04:22,406 (deepspeed_trainer:228) INFO: 29epoch:train:13901-14000batch: iter_time=1.102e-04, loss_ctc=68.233, loss_att=51.906, acc=0.718, loss=56.850, grad_norm=4.765, loss_scale=1.000, learning_rate=7.437e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 01:04:55,034 (deepspeed_trainer:228) INFO: 29epoch:train:14001-14100batch: iter_time=1.118e-04, loss_ctc=68.818, loss_att=48.237, acc=0.721, loss=54.392, grad_norm=5.017, loss_scale=1.000, learning_rate=7.436e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 01:05:28,145 (deepspeed_trainer:228) INFO: 29epoch:train:14101-14200batch: iter_time=1.104e-04, loss_ctc=73.571, loss_att=57.885, acc=0.711, loss=62.541, grad_norm=5.662, loss_scale=1.000, learning_rate=7.435e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 01:06:01,330 (deepspeed_trainer:228) INFO: 29epoch:train:14201-14300batch: iter_time=1.107e-04, loss_ctc=72.610, loss_att=55.029, acc=0.726, loss=60.289, grad_norm=4.980, loss_scale=1.000, learning_rate=7.434e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 01:06:34,387 (deepspeed_trainer:228) INFO: 29epoch:train:14301-14400batch: iter_time=1.100e-04, loss_ctc=68.006, loss_att=54.545, acc=0.725, loss=58.570, grad_norm=4.801, loss_scale=1.000, learning_rate=7.433e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 01:07:07,448 (deepspeed_trainer:228) INFO: 29epoch:train:14401-14500batch: iter_time=1.109e-04, loss_ctc=72.949, loss_att=55.117, acc=0.717, loss=60.472, grad_norm=5.585, loss_scale=1.000, learning_rate=7.433e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 01:07:40,635 (deepspeed_trainer:228) INFO: 29epoch:train:14501-14600batch: iter_time=1.125e-04, loss_ctc=71.913, loss_att=56.477, acc=0.705, loss=61.116, grad_norm=5.220, loss_scale=1.000, learning_rate=7.432e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 01:08:13,817 (deepspeed_trainer:228) INFO: 29epoch:train:14601-14700batch: iter_time=1.103e-04, loss_ctc=69.517, loss_att=49.968, acc=0.733, loss=55.817, grad_norm=5.413, loss_scale=1.000, learning_rate=7.431e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 01:08:47,215 (deepspeed_trainer:228) INFO: 29epoch:train:14701-14800batch: iter_time=1.143e-04, loss_ctc=80.904, loss_att=58.274, acc=0.727, loss=65.076, grad_norm=5.302, loss_scale=1.000, learning_rate=7.430e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 01:09:20,999 (deepspeed_trainer:228) INFO: 29epoch:train:14801-14900batch: iter_time=1.093e-04, loss_ctc=72.004, loss_att=57.457, acc=0.724, loss=61.859, grad_norm=5.142, loss_scale=1.000, learning_rate=7.429e-05, step_time=0.338 [2024-12-08 01:09:54,188] [INFO] [logging.py:129:log_dist] [Rank 0] step=435000, skipped=0, lr=[np.float64(7.427804989380097e-05)], mom=[[0.9, 0.98]] [2024-12-08 01:09:54,188] [INFO] [timer.py:264:stop] epoch=0/micro_step=45000/global_step=45000, RunningAvgSamplesPerSec=49.28427948379184, CurrSamplesPerSec=48.54359816685995, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-08 01:09:54,191 (deepspeed_trainer:228) INFO: 29epoch:train:14901-15000batch: iter_time=1.079e-04, loss_ctc=70.039, loss_att=53.572, acc=0.721, loss=58.522, grad_norm=5.119, loss_scale=1.000, learning_rate=7.428e-05, step_time=0.332 [2024-12-08 01:10:09,127] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:09,198] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:10,745] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:10,046] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:10,810] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:10,332] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:11,219] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:10,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:11,270] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:10,553] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:11,283] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:11,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:11,385] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:11,391] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:10,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:10,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:24,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:24,953] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:26,419] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:25,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:26,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:25,869] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:27,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:26,587] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:27,449] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:27,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:27,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:27,213] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:27,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:28,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:28,266] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:28,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:39,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:41,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:41,976] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:42,121] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:42,166] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:42,971] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:42,580] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:42,815] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:43,748] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:43,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:43,312] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:44,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:44,275] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:44,950] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:45,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:45,136] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:53,571] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:56,723] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:56,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:57,510] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:57,910] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:58,294] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:59,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:58,620] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:59,676] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:59,904] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:10:59,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:11:00,447] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:11:00,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:11:01,360] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:11:01,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:11:01,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:11:14,161] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 29 is about to be saved! [2024-12-08 01:11:14,191] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/mp_rank_00_model_states.pt [2024-12-08 01:11:14,191] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/mp_rank_00_model_states.pt... [2024-12-08 01:11:16,097] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/mp_rank_00_model_states.pt. [2024-12-08 01:11:16,255] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-08 01:11:16,256] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-08 01:11:16,256] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-08 01:11:16,256] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-08 01:11:16,257] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-08 01:11:15,524] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-08 01:11:15,526] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-08 01:11:15,526] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-08 01:11:15,526] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-08 01:11:16,259] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-08 01:11:16,259] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-08 01:11:15,526] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-08 01:11:15,528] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-08 01:11:15,528] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-08 01:11:16,261] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-08 01:11:15,529] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-08 01:11:16,851] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-08 01:11:16,852] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-08 01:11:16,852] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [2024-12-08 01:11:16,969] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-08 01:11:16,969] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-08 01:11:16,970] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [2024-12-08 01:11:16,987] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-08 01:11:16,987] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-08 01:11:16,987] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [2024-12-08 01:11:16,992] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-08 01:11:17,017] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-08 01:11:17,017] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [2024-12-08 01:11:17,019] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-08 01:11:17,019] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-08 01:11:17,019] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [2024-12-08 01:11:17,026] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-08 01:11:17,026] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-08 01:11:17,026] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [2024-12-08 01:11:17,030] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-08 01:11:17,030] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-08 01:11:17,031] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [2024-12-08 01:11:17,034] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-08 01:11:17,034] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-08 01:11:17,034] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [2024-12-08 01:11:16,659] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-08 01:11:16,659] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-08 01:11:16,659] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-08 01:11:16,659] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-08 01:11:16,659] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-08 01:11:16,659] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-08 01:11:16,659] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-08 01:11:16,659] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-08 01:11:16,659] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [2024-12-08 01:11:16,659] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [2024-12-08 01:11:16,659] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-08 01:11:16,659] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-08 01:11:16,659] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-08 01:11:16,659] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-08 01:11:16,659] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-08 01:11:16,659] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-08 01:11:16,659] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [2024-12-08 01:11:16,659] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [2024-12-08 01:11:16,659] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [2024-12-08 01:11:16,659] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [2024-12-08 01:11:16,659] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [2024-12-08 01:11:16,659] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-08 01:11:16,660] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_29/29/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-08 01:11:16,660] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 29 is ready now! [cnode7-012:0/16] 2024-12-08 01:11:17,401 (deepspeed_trainer:158) INFO: 29epoch results: [train] iter_time=0.134, loss_ctc=73.514, loss_att=55.626, acc=0.716, loss=60.993, grad_norm=5.359, loss_scale=1.000, learning_rate=7.493e-05, step_time=0.335, time=1 hour, 57 minutes and 30.4 seconds, total_count=435029, gpu_max_cached_mem_GB=39.385, [valid] loss_ctc=4.156, cer_ctc=0.103, loss_att=6.531, acc=0.858, cer=0.159, wer=0.812, loss=5.812, time=1 minute and 10.16 seconds, total_count=29, gpu_max_cached_mem_GB=39.385 [cnode7-012:0/16] 2024-12-08 01:11:21,765 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-08 01:11:49,185 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 01:12:06,358 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 01:12:06,358 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 01:12:06,361 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 01:12:22,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:12:23,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:12:23,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:12:23,805] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:12:23,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:12:24,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:12:24,584] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:12:25,548] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:12:24,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:12:26,170] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:12:25,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:12:27,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:12:27,365] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:12:27,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:12:27,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:12:31,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:08,714] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:10,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:10,165] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:10,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:11,186] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:12,300] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:12,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:13,058] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:13,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:14,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:14,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:16,034] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:16,440] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:17,400] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:17,630] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:19,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:56,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:57,785] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:57,263] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:58,128] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:59,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:13:59,737] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:00,115] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:01,980] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:02,308] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:03,190] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:02,816] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:04,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:05,278] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:06,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:06,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:08,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:42,205] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:43,871] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:44,777] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:46,055] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:46,909] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:47,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:48,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:50,617] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:50,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:51,193] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:51,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:54,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:55,152] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:55,804] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:56,045] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:14:57,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 01:15:57,085 (deepspeed_trainer:228) INFO: 30epoch:train:1-100batch: iter_time=2.411, loss_ctc=68.938, loss_att=59.300, acc=0.707, loss=62.205, grad_norm=5.337, loss_scale=1.000, learning_rate=7.427e-05, step_time=0.342 [cnode7-012:0/16] 2024-12-08 01:16:30,276 (deepspeed_trainer:228) INFO: 30epoch:train:101-200batch: iter_time=1.172e-04, loss_ctc=73.416, loss_att=52.108, acc=0.726, loss=58.486, grad_norm=5.283, loss_scale=1.000, learning_rate=7.427e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 01:17:03,874 (deepspeed_trainer:228) INFO: 30epoch:train:201-300batch: iter_time=1.194e-04, loss_ctc=75.944, loss_att=53.099, acc=0.722, loss=59.929, grad_norm=5.330, loss_scale=1.000, learning_rate=7.426e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 01:17:37,000 (deepspeed_trainer:228) INFO: 30epoch:train:301-400batch: iter_time=1.185e-04, loss_ctc=67.993, loss_att=48.096, acc=0.716, loss=54.060, grad_norm=5.044, loss_scale=1.000, learning_rate=7.425e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 01:18:10,252 (deepspeed_trainer:228) INFO: 30epoch:train:401-500batch: iter_time=1.220e-04, loss_ctc=71.147, loss_att=54.210, acc=0.709, loss=59.300, grad_norm=5.144, loss_scale=1.000, learning_rate=7.424e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 01:18:43,631 (deepspeed_trainer:228) INFO: 30epoch:train:501-600batch: iter_time=1.191e-04, loss_ctc=71.746, loss_att=50.593, acc=0.723, loss=56.929, grad_norm=5.459, loss_scale=1.000, learning_rate=7.423e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 01:19:17,064 (deepspeed_trainer:228) INFO: 30epoch:train:601-700batch: iter_time=1.163e-04, loss_ctc=78.049, loss_att=59.667, acc=0.694, loss=65.198, grad_norm=5.435, loss_scale=1.000, learning_rate=7.422e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 01:19:50,791 (deepspeed_trainer:228) INFO: 30epoch:train:701-800batch: iter_time=1.222e-04, loss_ctc=74.611, loss_att=57.456, acc=0.700, loss=62.632, grad_norm=5.366, loss_scale=1.000, learning_rate=7.421e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 01:20:24,313 (deepspeed_trainer:228) INFO: 30epoch:train:801-900batch: iter_time=1.200e-04, loss_ctc=66.246, loss_att=51.102, acc=0.705, loss=55.678, grad_norm=4.687, loss_scale=1.000, learning_rate=7.421e-05, step_time=0.335 [2024-12-08 01:20:57,890] [INFO] [logging.py:129:log_dist] [Rank 0] step=436000, skipped=0, lr=[np.float64(7.419281994096438e-05)], mom=[[0.9, 0.98]] [2024-12-08 01:20:57,890] [INFO] [timer.py:264:stop] epoch=0/micro_step=46000/global_step=46000, RunningAvgSamplesPerSec=49.2837698498018, CurrSamplesPerSec=51.31878297126007, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-08 01:20:57,893 (deepspeed_trainer:228) INFO: 30epoch:train:901-1000batch: iter_time=1.174e-04, loss_ctc=72.801, loss_att=55.017, acc=0.716, loss=60.346, grad_norm=4.872, loss_scale=1.000, learning_rate=7.420e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 01:21:31,128 (deepspeed_trainer:228) INFO: 30epoch:train:1001-1100batch: iter_time=1.156e-04, loss_ctc=73.234, loss_att=54.400, acc=0.716, loss=60.039, grad_norm=5.287, loss_scale=1.000, learning_rate=7.419e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 01:22:04,398 (deepspeed_trainer:228) INFO: 30epoch:train:1101-1200batch: iter_time=1.130e-04, loss_ctc=76.451, loss_att=58.197, acc=0.704, loss=63.690, grad_norm=5.137, loss_scale=1.000, learning_rate=7.418e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 01:22:38,022 (deepspeed_trainer:228) INFO: 30epoch:train:1201-1300batch: iter_time=1.935e-04, loss_ctc=77.079, loss_att=62.234, acc=0.709, loss=66.698, grad_norm=5.390, loss_scale=1.000, learning_rate=7.417e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 01:23:11,954 (deepspeed_trainer:228) INFO: 30epoch:train:1301-1400batch: iter_time=1.150e-04, loss_ctc=67.117, loss_att=49.223, acc=0.717, loss=54.595, grad_norm=4.968, loss_scale=1.000, learning_rate=7.416e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 01:23:45,627 (deepspeed_trainer:228) INFO: 30epoch:train:1401-1500batch: iter_time=1.130e-04, loss_ctc=74.827, loss_att=56.368, acc=0.709, loss=61.927, grad_norm=5.417, loss_scale=1.000, learning_rate=7.415e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 01:24:19,300 (deepspeed_trainer:228) INFO: 30epoch:train:1501-1600batch: iter_time=1.152e-04, loss_ctc=73.909, loss_att=55.268, acc=0.709, loss=60.870, grad_norm=5.410, loss_scale=1.000, learning_rate=7.415e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 01:24:53,155 (deepspeed_trainer:228) INFO: 30epoch:train:1601-1700batch: iter_time=1.189e-04, loss_ctc=73.965, loss_att=52.233, acc=0.722, loss=58.757, grad_norm=4.807, loss_scale=1.000, learning_rate=7.414e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 01:25:26,504 (deepspeed_trainer:228) INFO: 30epoch:train:1701-1800batch: iter_time=1.225e-04, loss_ctc=70.800, loss_att=52.170, acc=0.708, loss=57.752, grad_norm=5.242, loss_scale=1.000, learning_rate=7.413e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 01:25:56,558 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-08 01:26:22,608 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 01:26:40,195 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 01:26:40,196 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 01:26:40,198 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 01:27:03,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:03,702] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:04,021] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:06,204] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:06,234] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:06,309] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:06,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:06,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:08,594] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:07,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:07,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:08,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:10,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:10,528] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:09,796] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:10,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:52,624] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:53,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:53,447] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:56,928] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:57,468] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:57,703] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:57,069] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:57,626] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:58,136] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:59,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:58,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:59,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:00,058] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:00,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:00,770] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:27:59,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:39,476] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:40,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:40,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:45,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:45,829] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:45,943] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:45,706] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:46,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:48,396] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:47,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:48,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:48,676] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:47,791] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:48,925] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:49,031] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:28:50,535] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:28,198] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:29,434] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:29,799] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:33,029] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:33,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:34,541] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:35,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:36,860] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:37,129] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:38,404] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:38,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:38,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:38,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:38,358] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:39,536] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:29:40,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 01:30:15,939 (deepspeed_trainer:228) INFO: 30epoch:train:1801-1900batch: iter_time=2.539, loss_ctc=66.714, loss_att=52.368, acc=0.703, loss=56.665, grad_norm=5.414, loss_scale=1.000, learning_rate=7.412e-05, step_time=0.356 [2024-12-08 01:30:49,774] [INFO] [logging.py:129:log_dist] [Rank 0] step=437000, skipped=0, lr=[np.float64(7.410788270640247e-05)], mom=[[0.9, 0.98]] [2024-12-08 01:30:49,776] [INFO] [timer.py:264:stop] epoch=0/micro_step=47000/global_step=47000, RunningAvgSamplesPerSec=49.27399316586443, CurrSamplesPerSec=49.85117510958181, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-08 01:30:49,779 (deepspeed_trainer:228) INFO: 30epoch:train:1901-2000batch: iter_time=1.144e-04, loss_ctc=76.249, loss_att=58.992, acc=0.717, loss=64.173, grad_norm=5.039, loss_scale=1.000, learning_rate=7.411e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 01:31:23,195 (deepspeed_trainer:228) INFO: 30epoch:train:2001-2100batch: iter_time=1.150e-04, loss_ctc=68.634, loss_att=50.225, acc=0.733, loss=55.737, grad_norm=4.761, loss_scale=1.000, learning_rate=7.410e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 01:31:56,898 (deepspeed_trainer:228) INFO: 30epoch:train:2101-2200batch: iter_time=1.108e-04, loss_ctc=75.032, loss_att=50.890, acc=0.725, loss=58.124, grad_norm=5.014, loss_scale=1.000, learning_rate=7.410e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 01:32:29,024 (deepspeed_trainer:228) INFO: 30epoch:train:2201-2300batch: iter_time=1.145e-04, loss_ctc=67.113, loss_att=49.495, acc=0.715, loss=54.783, grad_norm=5.213, loss_scale=1.000, learning_rate=7.409e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 01:33:02,812 (deepspeed_trainer:228) INFO: 30epoch:train:2301-2400batch: iter_time=1.126e-04, loss_ctc=71.878, loss_att=54.868, acc=0.717, loss=59.982, grad_norm=5.377, loss_scale=1.000, learning_rate=7.408e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 01:33:36,035 (deepspeed_trainer:228) INFO: 30epoch:train:2401-2500batch: iter_time=1.112e-04, loss_ctc=72.682, loss_att=50.350, acc=0.715, loss=57.068, grad_norm=4.752, loss_scale=1.000, learning_rate=7.407e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 01:34:09,570 (deepspeed_trainer:228) INFO: 30epoch:train:2501-2600batch: iter_time=1.125e-04, loss_ctc=76.455, loss_att=62.267, acc=0.695, loss=66.548, grad_norm=5.377, loss_scale=1.000, learning_rate=7.406e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 01:34:43,121 (deepspeed_trainer:228) INFO: 30epoch:train:2601-2700batch: iter_time=1.141e-04, loss_ctc=73.531, loss_att=56.472, acc=0.706, loss=61.625, grad_norm=5.255, loss_scale=1.000, learning_rate=7.405e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 01:35:16,739 (deepspeed_trainer:228) INFO: 30epoch:train:2701-2800batch: iter_time=1.116e-04, loss_ctc=65.860, loss_att=47.936, acc=0.714, loss=53.316, grad_norm=5.015, loss_scale=1.000, learning_rate=7.404e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 01:35:50,857 (deepspeed_trainer:228) INFO: 30epoch:train:2801-2900batch: iter_time=1.147e-04, loss_ctc=74.267, loss_att=56.752, acc=0.712, loss=62.017, grad_norm=5.665, loss_scale=1.000, learning_rate=7.404e-05, step_time=0.341 [2024-12-08 01:36:23,698] [INFO] [logging.py:129:log_dist] [Rank 0] step=438000, skipped=0, lr=[np.float64(7.40232365183972e-05)], mom=[[0.9, 0.98]] [2024-12-08 01:36:23,699] [INFO] [timer.py:264:stop] epoch=0/micro_step=48000/global_step=48000, RunningAvgSamplesPerSec=49.27782115983287, CurrSamplesPerSec=49.607263113040375, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-08 01:36:23,701 (deepspeed_trainer:228) INFO: 30epoch:train:2901-3000batch: iter_time=1.102e-04, loss_ctc=71.139, loss_att=52.529, acc=0.722, loss=58.102, grad_norm=4.935, loss_scale=1.000, learning_rate=7.403e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 01:36:56,782 (deepspeed_trainer:228) INFO: 30epoch:train:3001-3100batch: iter_time=1.106e-04, loss_ctc=77.807, loss_att=57.125, acc=0.711, loss=63.344, grad_norm=5.509, loss_scale=1.000, learning_rate=7.402e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 01:37:29,631 (deepspeed_trainer:228) INFO: 30epoch:train:3101-3200batch: iter_time=1.114e-04, loss_ctc=69.451, loss_att=56.608, acc=0.716, loss=60.440, grad_norm=4.874, loss_scale=1.000, learning_rate=7.401e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 01:38:02,652 (deepspeed_trainer:228) INFO: 30epoch:train:3201-3300batch: iter_time=1.127e-04, loss_ctc=72.102, loss_att=51.503, acc=0.720, loss=57.685, grad_norm=4.886, loss_scale=1.000, learning_rate=7.400e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 01:38:35,680 (deepspeed_trainer:228) INFO: 30epoch:train:3301-3400batch: iter_time=1.132e-04, loss_ctc=73.303, loss_att=54.277, acc=0.714, loss=60.007, grad_norm=5.600, loss_scale=1.000, learning_rate=7.399e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 01:39:08,974 (deepspeed_trainer:228) INFO: 30epoch:train:3401-3500batch: iter_time=1.174e-04, loss_ctc=78.933, loss_att=57.796, acc=0.709, loss=64.134, grad_norm=5.625, loss_scale=1.000, learning_rate=7.399e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 01:39:42,381 (deepspeed_trainer:228) INFO: 30epoch:train:3501-3600batch: iter_time=1.131e-04, loss_ctc=68.997, loss_att=48.011, acc=0.723, loss=54.313, grad_norm=5.281, loss_scale=1.000, learning_rate=7.398e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 01:40:15,734 (deepspeed_trainer:228) INFO: 30epoch:train:3601-3700batch: iter_time=1.151e-04, loss_ctc=72.454, loss_att=55.451, acc=0.704, loss=60.554, grad_norm=5.687, loss_scale=1.000, learning_rate=7.397e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 01:40:36,921 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-08 01:41:03,388 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 01:41:18,967 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 01:41:18,967 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 01:41:18,969 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 01:41:46,790] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:41:44,856] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:41:45,129] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:41:47,972] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:41:45,435] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:41:48,474] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:41:47,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:41:47,822] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:41:48,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:41:51,382] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:41:52,032] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:41:49,403] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:41:52,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:41:52,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:41:52,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:41:50,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:33,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:35,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:33,264] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:36,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:33,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:34,136] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:36,893] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:37,543] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:40,492] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:38,057] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:40,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:40,967] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:38,447] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:41,493] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:41,838] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:42:39,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:20,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:22,110] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:21,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:21,878] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:22,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:27,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:28,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:29,261] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:29,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:27,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:30,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:27,574] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:28,752] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:28,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:33,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:43:31,777] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:07,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:09,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:08,648] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:09,689] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:10,350] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:16,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:18,345] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:15,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:19,028] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:16,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:19,079] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:19,448] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:16,902] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:16,904] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:22,684] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:44:23,252] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 01:45:05,549 (deepspeed_trainer:228) INFO: 30epoch:train:3701-3800batch: iter_time=2.509, loss_ctc=62.843, loss_att=47.957, acc=0.716, loss=52.397, grad_norm=4.979, loss_scale=1.000, learning_rate=7.396e-05, step_time=0.386 [cnode7-012:0/16] 2024-12-08 01:45:39,105 (deepspeed_trainer:228) INFO: 30epoch:train:3801-3900batch: iter_time=1.177e-04, loss_ctc=74.541, loss_att=59.503, acc=0.714, loss=64.026, grad_norm=5.344, loss_scale=1.000, learning_rate=7.395e-05, step_time=0.335 [2024-12-08 01:46:12,084] [INFO] [logging.py:129:log_dist] [Rank 0] step=439000, skipped=0, lr=[np.float64(7.393887971856618e-05)], mom=[[0.9, 0.98]] [2024-12-08 01:46:12,085] [INFO] [timer.py:264:stop] epoch=0/micro_step=49000/global_step=49000, RunningAvgSamplesPerSec=49.2706896547626, CurrSamplesPerSec=52.85468511909501, MemAllocated=2.04GB, MaxMemAllocated=29.06GB [cnode7-012:0/16] 2024-12-08 01:46:12,087 (deepspeed_trainer:228) INFO: 30epoch:train:3901-4000batch: iter_time=1.183e-04, loss_ctc=73.522, loss_att=51.109, acc=0.735, loss=57.849, grad_norm=4.626, loss_scale=1.000, learning_rate=7.394e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 01:46:45,056 (deepspeed_trainer:228) INFO: 30epoch:train:4001-4100batch: iter_time=1.171e-04, loss_ctc=70.780, loss_att=50.840, acc=0.723, loss=56.833, grad_norm=5.187, loss_scale=1.000, learning_rate=7.393e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 01:47:17,696 (deepspeed_trainer:228) INFO: 30epoch:train:4101-4200batch: iter_time=1.184e-04, loss_ctc=65.425, loss_att=47.953, acc=0.715, loss=53.196, grad_norm=4.668, loss_scale=1.000, learning_rate=7.393e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 01:47:51,134 (deepspeed_trainer:228) INFO: 30epoch:train:4201-4300batch: iter_time=1.201e-04, loss_ctc=78.851, loss_att=58.681, acc=0.713, loss=64.733, grad_norm=5.252, loss_scale=1.000, learning_rate=7.392e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 01:48:24,171 (deepspeed_trainer:228) INFO: 30epoch:train:4301-4400batch: iter_time=1.168e-04, loss_ctc=71.096, loss_att=47.835, acc=0.721, loss=54.789, grad_norm=4.671, loss_scale=1.000, learning_rate=7.391e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 01:48:57,254 (deepspeed_trainer:228) INFO: 30epoch:train:4401-4500batch: iter_time=1.156e-04, loss_ctc=74.900, loss_att=62.208, acc=0.695, loss=66.023, grad_norm=5.689, loss_scale=1.000, learning_rate=7.390e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 01:49:30,805 (deepspeed_trainer:228) INFO: 30epoch:train:4501-4600batch: iter_time=1.173e-04, loss_ctc=68.257, loss_att=53.819, acc=0.713, loss=58.147, grad_norm=5.373, loss_scale=1.000, learning_rate=7.389e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 01:50:04,023 (deepspeed_trainer:228) INFO: 30epoch:train:4601-4700batch: iter_time=1.179e-04, loss_ctc=69.681, loss_att=50.865, acc=0.715, loss=56.507, grad_norm=5.058, loss_scale=1.000, learning_rate=7.388e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 01:50:37,764 (deepspeed_trainer:228) INFO: 30epoch:train:4701-4800batch: iter_time=1.165e-04, loss_ctc=74.521, loss_att=55.877, acc=0.716, loss=61.439, grad_norm=5.131, loss_scale=1.000, learning_rate=7.388e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 01:51:11,690 (deepspeed_trainer:228) INFO: 30epoch:train:4801-4900batch: iter_time=1.156e-04, loss_ctc=72.353, loss_att=54.711, acc=0.722, loss=59.998, grad_norm=5.023, loss_scale=1.000, learning_rate=7.387e-05, step_time=0.339 [2024-12-08 01:51:45,636] [INFO] [logging.py:129:log_dist] [Rank 0] step=440000, skipped=0, lr=[np.float64(7.385481066172612e-05)], mom=[[0.9, 0.98]] [2024-12-08 01:51:45,636] [INFO] [timer.py:264:stop] epoch=0/micro_step=50000/global_step=50000, RunningAvgSamplesPerSec=49.27403287688762, CurrSamplesPerSec=50.5996543453674, MemAllocated=2.04GB, MaxMemAllocated=29.1GB [cnode7-012:0/16] 2024-12-08 01:51:45,639 (deepspeed_trainer:228) INFO: 30epoch:train:4901-5000batch: iter_time=1.162e-04, loss_ctc=72.085, loss_att=51.104, acc=0.717, loss=57.399, grad_norm=4.990, loss_scale=1.000, learning_rate=7.386e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 01:52:20,106 (deepspeed_trainer:228) INFO: 30epoch:train:5001-5100batch: iter_time=1.171e-04, loss_ctc=69.847, loss_att=57.930, acc=0.709, loss=61.516, grad_norm=4.597, loss_scale=1.000, learning_rate=7.385e-05, step_time=0.347 [cnode7-012:0/16] 2024-12-08 01:52:53,899 (deepspeed_trainer:228) INFO: 30epoch:train:5101-5200batch: iter_time=1.155e-04, loss_ctc=72.015, loss_att=53.608, acc=0.714, loss=59.148, grad_norm=5.184, loss_scale=1.000, learning_rate=7.384e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 01:53:27,414 (deepspeed_trainer:228) INFO: 30epoch:train:5201-5300batch: iter_time=1.166e-04, loss_ctc=68.904, loss_att=51.735, acc=0.717, loss=56.888, grad_norm=5.497, loss_scale=1.000, learning_rate=7.383e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 01:54:01,436 (deepspeed_trainer:228) INFO: 30epoch:train:5301-5400batch: iter_time=1.173e-04, loss_ctc=81.548, loss_att=57.104, acc=0.717, loss=64.413, grad_norm=6.068, loss_scale=1.000, learning_rate=7.383e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 01:54:34,862 (deepspeed_trainer:228) INFO: 30epoch:train:5401-5500batch: iter_time=1.161e-04, loss_ctc=67.770, loss_att=49.871, acc=0.723, loss=55.252, grad_norm=5.078, loss_scale=1.000, learning_rate=7.382e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 01:55:08,041 (deepspeed_trainer:228) INFO: 30epoch:train:5501-5600batch: iter_time=1.191e-04, loss_ctc=71.658, loss_att=55.740, acc=0.705, loss=60.487, grad_norm=5.049, loss_scale=1.000, learning_rate=7.381e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 01:55:20,524 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-08 01:55:47,193 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 01:56:04,823 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 01:56:04,824 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 01:56:04,826 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 01:56:31,036] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:56:28,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:56:31,852] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:56:28,874] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:56:29,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:56:32,714] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:56:33,881] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:56:31,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:56:34,779] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:56:35,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:56:33,706] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:56:33,789] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:56:36,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:56:34,201] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:56:37,329] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:56:35,024] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:19,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:16,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:16,840] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:19,821] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:17,736] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:22,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:24,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:21,293] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:24,587] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:24,805] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:26,006] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:26,298] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:23,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:24,008] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:24,386] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:57:24,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:04,695] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:07,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:05,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:08,701] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:08,529] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:12,120] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:12,567] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:10,086] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:13,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:14,842] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:12,071] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:15,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:15,500] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:13,118] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:13,654] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:14,018] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:52,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:56,280] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:56,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:55,811] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:58,486] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:59:01,899] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:58:59,067] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:59:02,218] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:59:02,987] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:59:00,257] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:59:04,827] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:59:01,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:59:05,090] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:59:02,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:59:02,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 01:59:05,416] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 01:59:54,823 (deepspeed_trainer:228) INFO: 30epoch:train:5601-5700batch: iter_time=2.512, loss_ctc=66.919, loss_att=57.226, acc=0.709, loss=60.133, grad_norm=5.522, loss_scale=1.000, learning_rate=7.380e-05, step_time=0.355 [cnode7-012:0/16] 2024-12-08 02:00:28,345 (deepspeed_trainer:228) INFO: 30epoch:train:5701-5800batch: iter_time=1.081e-04, loss_ctc=69.899, loss_att=52.517, acc=0.725, loss=57.721, grad_norm=5.287, loss_scale=1.000, learning_rate=7.379e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 02:01:01,935 (deepspeed_trainer:228) INFO: 30epoch:train:5801-5900batch: iter_time=1.091e-04, loss_ctc=77.287, loss_att=56.709, acc=0.731, loss=62.861, grad_norm=4.862, loss_scale=1.000, learning_rate=7.378e-05, step_time=0.336 [2024-12-08 02:01:35,130] [INFO] [logging.py:129:log_dist] [Rank 0] step=441000, skipped=0, lr=[np.float64(7.377102771575816e-05)], mom=[[0.9, 0.98]] [2024-12-08 02:01:35,131] [INFO] [timer.py:264:stop] epoch=0/micro_step=51000/global_step=51000, RunningAvgSamplesPerSec=49.2643840029056, CurrSamplesPerSec=50.64035689600937, MemAllocated=2.04GB, MaxMemAllocated=29.1GB [cnode7-012:0/16] 2024-12-08 02:01:35,134 (deepspeed_trainer:228) INFO: 30epoch:train:5901-6000batch: iter_time=1.094e-04, loss_ctc=67.001, loss_att=47.293, acc=0.723, loss=53.211, grad_norm=5.005, loss_scale=1.000, learning_rate=7.378e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 02:02:08,693 (deepspeed_trainer:228) INFO: 30epoch:train:6001-6100batch: iter_time=1.085e-04, loss_ctc=68.166, loss_att=52.262, acc=0.722, loss=57.022, grad_norm=4.983, loss_scale=1.000, learning_rate=7.377e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 02:02:42,187 (deepspeed_trainer:228) INFO: 30epoch:train:6101-6200batch: iter_time=1.075e-04, loss_ctc=76.164, loss_att=56.892, acc=0.717, loss=62.674, grad_norm=5.220, loss_scale=1.000, learning_rate=7.376e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 02:03:16,196 (deepspeed_trainer:228) INFO: 30epoch:train:6201-6300batch: iter_time=1.069e-04, loss_ctc=70.314, loss_att=51.576, acc=0.724, loss=57.202, grad_norm=4.871, loss_scale=1.000, learning_rate=7.375e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 02:03:49,236 (deepspeed_trainer:228) INFO: 30epoch:train:6301-6400batch: iter_time=1.106e-04, loss_ctc=74.526, loss_att=61.964, acc=0.706, loss=65.732, grad_norm=5.355, loss_scale=1.000, learning_rate=7.374e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 02:04:22,656 (deepspeed_trainer:228) INFO: 30epoch:train:6401-6500batch: iter_time=1.071e-04, loss_ctc=69.093, loss_att=52.442, acc=0.715, loss=57.447, grad_norm=5.270, loss_scale=1.000, learning_rate=7.373e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 02:04:55,692 (deepspeed_trainer:228) INFO: 30epoch:train:6501-6600batch: iter_time=1.091e-04, loss_ctc=69.978, loss_att=52.500, acc=0.723, loss=57.741, grad_norm=4.914, loss_scale=1.000, learning_rate=7.373e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 02:05:28,614 (deepspeed_trainer:228) INFO: 30epoch:train:6601-6700batch: iter_time=1.067e-04, loss_ctc=71.936, loss_att=53.485, acc=0.723, loss=59.012, grad_norm=5.417, loss_scale=1.000, learning_rate=7.372e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 02:06:02,029 (deepspeed_trainer:228) INFO: 30epoch:train:6701-6800batch: iter_time=1.073e-04, loss_ctc=74.530, loss_att=55.708, acc=0.724, loss=61.363, grad_norm=5.029, loss_scale=1.000, learning_rate=7.371e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 02:06:35,079 (deepspeed_trainer:228) INFO: 30epoch:train:6801-6900batch: iter_time=1.066e-04, loss_ctc=68.889, loss_att=50.308, acc=0.723, loss=55.871, grad_norm=4.787, loss_scale=1.000, learning_rate=7.370e-05, step_time=0.330 [2024-12-08 02:07:08,599] [INFO] [logging.py:129:log_dist] [Rank 0] step=442000, skipped=0, lr=[np.float64(7.36875292614748e-05)], mom=[[0.9, 0.98]] [2024-12-08 02:07:08,600] [INFO] [timer.py:264:stop] epoch=0/micro_step=52000/global_step=52000, RunningAvgSamplesPerSec=49.26828260257276, CurrSamplesPerSec=49.336873758854715, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 02:07:08,603 (deepspeed_trainer:228) INFO: 30epoch:train:6901-7000batch: iter_time=1.084e-04, loss_ctc=72.884, loss_att=62.510, acc=0.710, loss=65.612, grad_norm=4.976, loss_scale=1.000, learning_rate=7.369e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 02:07:42,186 (deepspeed_trainer:228) INFO: 30epoch:train:7001-7100batch: iter_time=1.090e-04, loss_ctc=71.396, loss_att=53.944, acc=0.724, loss=59.176, grad_norm=5.007, loss_scale=1.000, learning_rate=7.368e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 02:08:15,710 (deepspeed_trainer:228) INFO: 30epoch:train:7101-7200batch: iter_time=1.078e-04, loss_ctc=73.117, loss_att=56.778, acc=0.722, loss=61.673, grad_norm=5.509, loss_scale=1.000, learning_rate=7.367e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 02:08:49,056 (deepspeed_trainer:228) INFO: 30epoch:train:7201-7300batch: iter_time=1.076e-04, loss_ctc=76.566, loss_att=54.134, acc=0.725, loss=60.858, grad_norm=4.761, loss_scale=1.000, learning_rate=7.367e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 02:09:22,193 (deepspeed_trainer:228) INFO: 30epoch:train:7301-7400batch: iter_time=1.112e-04, loss_ctc=69.803, loss_att=49.683, acc=0.720, loss=55.712, grad_norm=5.558, loss_scale=1.000, learning_rate=7.366e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 02:09:55,493 (deepspeed_trainer:228) INFO: 30epoch:train:7401-7500batch: iter_time=1.102e-04, loss_ctc=69.744, loss_att=55.413, acc=0.709, loss=59.721, grad_norm=5.204, loss_scale=1.000, learning_rate=7.365e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 02:09:59,778 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-08 02:10:26,085 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 02:10:42,684 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 02:10:42,685 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 02:10:42,687 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 02:11:09,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:06,911] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:10,020] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:08,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:08,272] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:11,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:11,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:13,997] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:14,038] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:14,300] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:12,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:12,769] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:12,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:13,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:15,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:16,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:56,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:55,045] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:57,548] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:56,522] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:11:56,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:03,000] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:03,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:00,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:03,440] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:03,686] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:01,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:01,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:02,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:02,467] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:05,067] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:05,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:45,641] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:46,081] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:45,815] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:46,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:46,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:52,703] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:50,567] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:53,351] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:51,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:53,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:51,961] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:52,031] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:52,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:54,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:55,376] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:12:55,676] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:33,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:33,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:33,098] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:33,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:34,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:40,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:42,889] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:43,252] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:41,077] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:41,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:42,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:42,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:44,748] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:44,852] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:45,599] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:13:46,780] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 02:14:45,046 (deepspeed_trainer:228) INFO: 30epoch:train:7501-7600batch: iter_time=2.534, loss_ctc=67.537, loss_att=58.228, acc=0.717, loss=61.014, grad_norm=4.900, loss_scale=1.000, learning_rate=7.364e-05, step_time=0.361 [cnode7-012:0/16] 2024-12-08 02:15:18,262 (deepspeed_trainer:228) INFO: 30epoch:train:7601-7700batch: iter_time=1.067e-04, loss_ctc=71.147, loss_att=52.041, acc=0.731, loss=57.760, grad_norm=5.245, loss_scale=1.000, learning_rate=7.363e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 02:15:51,366 (deepspeed_trainer:228) INFO: 30epoch:train:7701-7800batch: iter_time=1.081e-04, loss_ctc=74.604, loss_att=51.023, acc=0.735, loss=58.115, grad_norm=5.000, loss_scale=1.000, learning_rate=7.363e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 02:16:24,245 (deepspeed_trainer:228) INFO: 30epoch:train:7801-7900batch: iter_time=1.083e-04, loss_ctc=67.393, loss_att=49.462, acc=0.724, loss=54.852, grad_norm=5.125, loss_scale=1.000, learning_rate=7.362e-05, step_time=0.329 [2024-12-08 02:16:58,179] [INFO] [logging.py:129:log_dist] [Rank 0] step=443000, skipped=0, lr=[np.float64(7.360431369248834e-05)], mom=[[0.9, 0.98]] [2024-12-08 02:16:58,179] [INFO] [timer.py:264:stop] epoch=0/micro_step=53000/global_step=53000, RunningAvgSamplesPerSec=49.26445269418329, CurrSamplesPerSec=49.44658402513629, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 02:16:58,182 (deepspeed_trainer:228) INFO: 30epoch:train:7901-8000batch: iter_time=1.072e-04, loss_ctc=70.433, loss_att=52.633, acc=0.725, loss=57.964, grad_norm=4.990, loss_scale=1.000, learning_rate=7.361e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 02:17:31,414 (deepspeed_trainer:228) INFO: 30epoch:train:8001-8100batch: iter_time=1.077e-04, loss_ctc=70.432, loss_att=50.798, acc=0.732, loss=56.695, grad_norm=5.014, loss_scale=1.000, learning_rate=7.360e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 02:18:04,750 (deepspeed_trainer:228) INFO: 30epoch:train:8101-8200batch: iter_time=1.070e-04, loss_ctc=75.530, loss_att=58.421, acc=0.709, loss=63.570, grad_norm=5.805, loss_scale=1.000, learning_rate=7.359e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 02:18:38,010 (deepspeed_trainer:228) INFO: 30epoch:train:8201-8300batch: iter_time=1.065e-04, loss_ctc=71.653, loss_att=56.302, acc=0.713, loss=60.929, grad_norm=5.032, loss_scale=1.000, learning_rate=7.358e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 02:19:10,876 (deepspeed_trainer:228) INFO: 30epoch:train:8301-8400batch: iter_time=1.085e-04, loss_ctc=65.179, loss_att=50.647, acc=0.713, loss=55.012, grad_norm=5.730, loss_scale=1.000, learning_rate=7.358e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 02:19:43,997 (deepspeed_trainer:228) INFO: 30epoch:train:8401-8500batch: iter_time=1.091e-04, loss_ctc=71.791, loss_att=53.788, acc=0.726, loss=59.185, grad_norm=5.272, loss_scale=1.000, learning_rate=7.357e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 02:20:17,307 (deepspeed_trainer:228) INFO: 30epoch:train:8501-8600batch: iter_time=1.096e-04, loss_ctc=73.004, loss_att=53.840, acc=0.726, loss=59.572, grad_norm=5.415, loss_scale=1.000, learning_rate=7.356e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 02:20:50,601 (deepspeed_trainer:228) INFO: 30epoch:train:8601-8700batch: iter_time=1.095e-04, loss_ctc=74.143, loss_att=56.537, acc=0.718, loss=61.804, grad_norm=5.421, loss_scale=1.000, learning_rate=7.355e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 02:21:24,579 (deepspeed_trainer:228) INFO: 30epoch:train:8701-8800batch: iter_time=1.099e-04, loss_ctc=75.796, loss_att=61.929, acc=0.721, loss=66.082, grad_norm=5.104, loss_scale=1.000, learning_rate=7.354e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 02:21:58,213 (deepspeed_trainer:228) INFO: 30epoch:train:8801-8900batch: iter_time=1.116e-04, loss_ctc=66.462, loss_att=50.118, acc=0.722, loss=55.027, grad_norm=5.232, loss_scale=1.000, learning_rate=7.353e-05, step_time=0.336 [2024-12-08 02:22:31,445] [INFO] [logging.py:129:log_dist] [Rank 0] step=444000, skipped=0, lr=[np.float64(7.352137941508121e-05)], mom=[[0.9, 0.98]] [2024-12-08 02:22:31,446] [INFO] [timer.py:264:stop] epoch=0/micro_step=54000/global_step=54000, RunningAvgSamplesPerSec=49.26880186623226, CurrSamplesPerSec=47.3125003486224, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 02:22:31,449 (deepspeed_trainer:228) INFO: 30epoch:train:8901-9000batch: iter_time=1.113e-04, loss_ctc=73.382, loss_att=56.699, acc=0.721, loss=61.705, grad_norm=5.548, loss_scale=1.000, learning_rate=7.353e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 02:23:04,562 (deepspeed_trainer:228) INFO: 30epoch:train:9001-9100batch: iter_time=1.111e-04, loss_ctc=72.320, loss_att=53.290, acc=0.723, loss=58.993, grad_norm=4.864, loss_scale=1.000, learning_rate=7.352e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 02:23:37,672 (deepspeed_trainer:228) INFO: 30epoch:train:9101-9200batch: iter_time=1.080e-04, loss_ctc=72.948, loss_att=52.092, acc=0.728, loss=58.357, grad_norm=4.984, loss_scale=1.000, learning_rate=7.351e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 02:24:10,973 (deepspeed_trainer:228) INFO: 30epoch:train:9201-9300batch: iter_time=1.100e-04, loss_ctc=69.886, loss_att=51.508, acc=0.716, loss=57.019, grad_norm=5.333, loss_scale=1.000, learning_rate=7.350e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 02:24:40,885 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-08 02:25:07,876 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 02:25:26,037 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 02:25:26,037 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 02:25:26,040 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 02:25:47,972] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:25:48,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:25:49,067] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:25:49,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:25:50,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:25:50,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:25:52,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:25:53,080] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:25:53,376] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:25:54,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:25:53,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:25:53,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:25:54,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:25:54,855] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:25:55,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:25:56,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:35,936] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:37,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:38,970] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:39,091] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:39,302] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:40,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:41,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:42,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:43,801] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:43,391] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:44,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:43,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:44,440] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:45,074] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:45,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:26:46,587] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:25,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:26,718] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:27,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:27,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:30,312] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:30,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:32,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:31,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:32,354] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:32,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:33,291] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:33,640] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:33,435] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:33,712] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:34,358] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:27:37,021] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:12,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:16,136] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:16,459] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:19,270] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:19,535] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:19,963] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:21,215] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:21,213] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:21,942] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:22,797] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:23,018] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:23,598] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:23,017] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:23,273] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:25,091] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:28:26,380] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 02:29:01,563 (deepspeed_trainer:228) INFO: 30epoch:train:9301-9400batch: iter_time=2.544, loss_ctc=64.714, loss_att=51.726, acc=0.712, loss=55.616, grad_norm=5.302, loss_scale=1.000, learning_rate=7.349e-05, step_time=0.366 [cnode7-012:0/16] 2024-12-08 02:29:35,641 (deepspeed_trainer:228) INFO: 30epoch:train:9401-9500batch: iter_time=1.133e-04, loss_ctc=74.742, loss_att=60.109, acc=0.720, loss=64.491, grad_norm=5.362, loss_scale=1.000, learning_rate=7.348e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 02:30:09,035 (deepspeed_trainer:228) INFO: 30epoch:train:9501-9600batch: iter_time=1.076e-04, loss_ctc=68.521, loss_att=50.283, acc=0.738, loss=55.767, grad_norm=4.906, loss_scale=1.000, learning_rate=7.348e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 02:30:42,700 (deepspeed_trainer:228) INFO: 30epoch:train:9601-9700batch: iter_time=1.113e-04, loss_ctc=74.443, loss_att=50.896, acc=0.727, loss=57.978, grad_norm=5.228, loss_scale=1.000, learning_rate=7.347e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 02:31:15,813 (deepspeed_trainer:228) INFO: 30epoch:train:9701-9800batch: iter_time=1.098e-04, loss_ctc=66.259, loss_att=48.874, acc=0.720, loss=54.092, grad_norm=4.690, loss_scale=1.000, learning_rate=7.346e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 02:31:49,391 (deepspeed_trainer:228) INFO: 30epoch:train:9801-9900batch: iter_time=1.095e-04, loss_ctc=70.512, loss_att=54.428, acc=0.719, loss=59.273, grad_norm=5.329, loss_scale=1.000, learning_rate=7.345e-05, step_time=0.336 [2024-12-08 02:32:23,104] [INFO] [logging.py:129:log_dist] [Rank 0] step=445000, skipped=0, lr=[np.float64(7.343872484807773e-05)], mom=[[0.9, 0.98]] [2024-12-08 02:32:23,105] [INFO] [timer.py:264:stop] epoch=0/micro_step=55000/global_step=55000, RunningAvgSamplesPerSec=49.26145760840155, CurrSamplesPerSec=47.469458230969906, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 02:32:23,108 (deepspeed_trainer:228) INFO: 30epoch:train:9901-10000batch: iter_time=1.095e-04, loss_ctc=71.683, loss_att=49.817, acc=0.719, loss=56.374, grad_norm=4.950, loss_scale=1.000, learning_rate=7.344e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 02:32:57,886 (deepspeed_trainer:228) INFO: 30epoch:train:10001-10100batch: iter_time=1.097e-04, loss_ctc=74.895, loss_att=61.774, acc=0.698, loss=65.703, grad_norm=5.454, loss_scale=1.000, learning_rate=7.343e-05, step_time=0.348 [cnode7-012:0/16] 2024-12-08 02:33:31,193 (deepspeed_trainer:228) INFO: 30epoch:train:10101-10200batch: iter_time=1.113e-04, loss_ctc=71.522, loss_att=54.803, acc=0.711, loss=59.831, grad_norm=5.261, loss_scale=1.000, learning_rate=7.343e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 02:34:04,365 (deepspeed_trainer:228) INFO: 30epoch:train:10201-10300batch: iter_time=1.090e-04, loss_ctc=64.385, loss_att=47.327, acc=0.718, loss=52.447, grad_norm=4.834, loss_scale=1.000, learning_rate=7.342e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 02:34:37,557 (deepspeed_trainer:228) INFO: 30epoch:train:10301-10400batch: iter_time=1.072e-04, loss_ctc=72.787, loss_att=56.077, acc=0.716, loss=61.085, grad_norm=5.256, loss_scale=1.000, learning_rate=7.341e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 02:35:10,813 (deepspeed_trainer:228) INFO: 30epoch:train:10401-10500batch: iter_time=1.091e-04, loss_ctc=71.511, loss_att=52.572, acc=0.725, loss=58.272, grad_norm=5.215, loss_scale=1.000, learning_rate=7.340e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 02:35:44,051 (deepspeed_trainer:228) INFO: 30epoch:train:10501-10600batch: iter_time=1.088e-04, loss_ctc=76.974, loss_att=57.075, acc=0.713, loss=63.012, grad_norm=5.592, loss_scale=1.000, learning_rate=7.339e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 02:36:17,133 (deepspeed_trainer:228) INFO: 30epoch:train:10601-10700batch: iter_time=1.076e-04, loss_ctc=67.916, loss_att=56.508, acc=0.718, loss=59.939, grad_norm=4.835, loss_scale=1.000, learning_rate=7.339e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 02:36:50,271 (deepspeed_trainer:228) INFO: 30epoch:train:10701-10800batch: iter_time=1.112e-04, loss_ctc=71.774, loss_att=51.648, acc=0.721, loss=57.675, grad_norm=5.036, loss_scale=1.000, learning_rate=7.338e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 02:37:23,513 (deepspeed_trainer:228) INFO: 30epoch:train:10801-10900batch: iter_time=1.085e-04, loss_ctc=71.974, loss_att=53.957, acc=0.717, loss=59.328, grad_norm=5.536, loss_scale=1.000, learning_rate=7.337e-05, step_time=0.332 [2024-12-08 02:37:56,757] [INFO] [logging.py:129:log_dist] [Rank 0] step=446000, skipped=0, lr=[np.float64(7.335634842271748e-05)], mom=[[0.9, 0.98]] [2024-12-08 02:37:56,758] [INFO] [timer.py:264:stop] epoch=0/micro_step=56000/global_step=56000, RunningAvgSamplesPerSec=49.26655577767917, CurrSamplesPerSec=48.8899614690733, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 02:37:56,760 (deepspeed_trainer:228) INFO: 30epoch:train:10901-11000batch: iter_time=1.084e-04, loss_ctc=77.484, loss_att=56.837, acc=0.713, loss=63.002, grad_norm=5.566, loss_scale=1.000, learning_rate=7.336e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 02:38:29,807 (deepspeed_trainer:228) INFO: 30epoch:train:11001-11100batch: iter_time=1.103e-04, loss_ctc=67.884, loss_att=47.786, acc=0.725, loss=53.834, grad_norm=5.195, loss_scale=1.000, learning_rate=7.335e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 02:39:03,357 (deepspeed_trainer:228) INFO: 30epoch:train:11101-11200batch: iter_time=1.096e-04, loss_ctc=71.516, loss_att=54.941, acc=0.708, loss=59.874, grad_norm=5.813, loss_scale=1.000, learning_rate=7.334e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 02:39:24,511 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-08 02:39:51,694 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 02:40:07,649 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 02:40:07,650 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 02:40:07,652 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 02:40:33,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:40:33,990] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:40:33,363] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:40:33,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:40:34,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:40:34,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:40:35,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:40:34,702] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:40:36,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:40:35,836] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:40:36,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:40:36,121] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:40:37,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:40:37,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:40:37,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:40:38,447] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:20,181] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:22,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:21,844] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:23,663] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:23,573] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:24,331] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:24,743] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:25,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:25,263] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:26,332] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:26,333] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:25,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:26,702] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:26,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:26,976] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:41:27,260] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:08,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:10,339] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:11,908] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:13,522] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:13,705] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:14,005] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:13,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:13,928] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:14,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:15,989] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:16,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:16,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:16,415] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:17,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:17,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:18,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:55,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:42:57,486] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:43:00,205] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:43:02,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:43:02,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:43:01,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:43:02,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:43:04,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:43:03,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:43:04,573] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:43:05,120] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:43:05,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:43:05,425] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:43:06,852] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:43:06,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:43:07,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 02:43:50,816 (deepspeed_trainer:228) INFO: 30epoch:train:11201-11300batch: iter_time=2.518, loss_ctc=61.930, loss_att=48.357, acc=0.719, loss=52.425, grad_norm=5.151, loss_scale=1.000, learning_rate=7.334e-05, step_time=0.352 [cnode7-012:0/16] 2024-12-08 02:44:24,114 (deepspeed_trainer:228) INFO: 30epoch:train:11301-11400batch: iter_time=1.050e-04, loss_ctc=72.269, loss_att=60.898, acc=0.719, loss=64.350, grad_norm=5.642, loss_scale=1.000, learning_rate=7.333e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 02:44:57,465 (deepspeed_trainer:228) INFO: 30epoch:train:11401-11500batch: iter_time=1.116e-04, loss_ctc=73.368, loss_att=51.658, acc=0.738, loss=58.179, grad_norm=4.928, loss_scale=1.000, learning_rate=7.332e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 02:45:30,821 (deepspeed_trainer:228) INFO: 30epoch:train:11501-11600batch: iter_time=1.076e-04, loss_ctc=70.095, loss_att=50.364, acc=0.730, loss=56.273, grad_norm=5.559, loss_scale=1.000, learning_rate=7.331e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 02:46:03,743 (deepspeed_trainer:228) INFO: 30epoch:train:11601-11700batch: iter_time=1.089e-04, loss_ctc=65.147, loss_att=49.192, acc=0.722, loss=53.991, grad_norm=4.922, loss_scale=1.000, learning_rate=7.330e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 02:46:37,023 (deepspeed_trainer:228) INFO: 30epoch:train:11701-11800batch: iter_time=1.083e-04, loss_ctc=77.584, loss_att=60.036, acc=0.719, loss=65.274, grad_norm=5.293, loss_scale=1.000, learning_rate=7.329e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 02:47:10,218 (deepspeed_trainer:228) INFO: 30epoch:train:11801-11900batch: iter_time=1.101e-04, loss_ctc=70.472, loss_att=47.704, acc=0.727, loss=54.507, grad_norm=5.120, loss_scale=1.000, learning_rate=7.329e-05, step_time=0.332 [2024-12-08 02:47:43,778] [INFO] [logging.py:129:log_dist] [Rank 0] step=447000, skipped=0, lr=[np.float64(7.327424858253028e-05)], mom=[[0.9, 0.98]] [2024-12-08 02:47:43,779] [INFO] [timer.py:264:stop] epoch=0/micro_step=57000/global_step=57000, RunningAvgSamplesPerSec=49.26706304201829, CurrSamplesPerSec=51.49077507955575, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 02:47:43,781 (deepspeed_trainer:228) INFO: 30epoch:train:11901-12000batch: iter_time=1.087e-04, loss_ctc=74.139, loss_att=63.142, acc=0.705, loss=66.456, grad_norm=6.027, loss_scale=1.000, learning_rate=7.328e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 02:48:16,906 (deepspeed_trainer:228) INFO: 30epoch:train:12001-12100batch: iter_time=1.081e-04, loss_ctc=67.740, loss_att=53.515, acc=0.717, loss=57.804, grad_norm=4.971, loss_scale=1.000, learning_rate=7.327e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 02:48:50,856 (deepspeed_trainer:228) INFO: 30epoch:train:12101-12200batch: iter_time=1.096e-04, loss_ctc=68.997, loss_att=49.743, acc=0.725, loss=55.508, grad_norm=4.828, loss_scale=1.000, learning_rate=7.326e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 02:49:24,114 (deepspeed_trainer:228) INFO: 30epoch:train:12201-12300batch: iter_time=1.084e-04, loss_ctc=73.668, loss_att=56.285, acc=0.721, loss=61.482, grad_norm=5.173, loss_scale=1.000, learning_rate=7.325e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 02:49:57,417 (deepspeed_trainer:228) INFO: 30epoch:train:12301-12400batch: iter_time=1.078e-04, loss_ctc=71.194, loss_att=54.892, acc=0.727, loss=59.778, grad_norm=5.651, loss_scale=1.000, learning_rate=7.325e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 02:50:30,307 (deepspeed_trainer:228) INFO: 30epoch:train:12401-12500batch: iter_time=1.132e-04, loss_ctc=71.884, loss_att=51.457, acc=0.722, loss=57.608, grad_norm=5.355, loss_scale=1.000, learning_rate=7.324e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 02:51:03,663 (deepspeed_trainer:228) INFO: 30epoch:train:12501-12600batch: iter_time=1.087e-04, loss_ctc=69.106, loss_att=58.197, acc=0.720, loss=61.478, grad_norm=5.028, loss_scale=1.000, learning_rate=7.323e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 02:51:36,853 (deepspeed_trainer:228) INFO: 30epoch:train:12601-12700batch: iter_time=1.078e-04, loss_ctc=70.948, loss_att=54.876, acc=0.721, loss=59.693, grad_norm=5.693, loss_scale=1.000, learning_rate=7.322e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 02:52:09,929 (deepspeed_trainer:228) INFO: 30epoch:train:12701-12800batch: iter_time=1.090e-04, loss_ctc=67.492, loss_att=51.442, acc=0.726, loss=56.266, grad_norm=5.179, loss_scale=1.000, learning_rate=7.321e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 02:52:43,108 (deepspeed_trainer:228) INFO: 30epoch:train:12801-12900batch: iter_time=1.096e-04, loss_ctc=81.039, loss_att=57.302, acc=0.722, loss=64.398, grad_norm=5.457, loss_scale=1.000, learning_rate=7.320e-05, step_time=0.331 [2024-12-08 02:53:16,361] [INFO] [logging.py:129:log_dist] [Rank 0] step=448000, skipped=0, lr=[np.float64(7.31924237832126e-05)], mom=[[0.9, 0.98]] [2024-12-08 02:53:16,362] [INFO] [timer.py:264:stop] epoch=0/micro_step=58000/global_step=58000, RunningAvgSamplesPerSec=49.27287302251243, CurrSamplesPerSec=52.173143676103884, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 02:53:16,365 (deepspeed_trainer:228) INFO: 30epoch:train:12901-13000batch: iter_time=1.137e-04, loss_ctc=67.429, loss_att=50.105, acc=0.729, loss=55.299, grad_norm=4.751, loss_scale=1.000, learning_rate=7.320e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 02:53:49,564 (deepspeed_trainer:228) INFO: 30epoch:train:13001-13100batch: iter_time=1.100e-04, loss_ctc=71.273, loss_att=55.922, acc=0.711, loss=60.524, grad_norm=5.440, loss_scale=1.000, learning_rate=7.319e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 02:54:02,726 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-08 02:54:28,974 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 02:54:45,511 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 02:54:45,511 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 02:54:45,513 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 02:55:11,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:55:11,538] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:55:11,544] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:55:11,584] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:55:12,120] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:55:12,185] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:55:14,227] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:55:14,252] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:55:14,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:55:14,803] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:55:15,312] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:55:15,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:55:15,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:55:16,143] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:55:16,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:55:16,556] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:00,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:00,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:00,186] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:00,661] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:01,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:01,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:03,065] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:03,567] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:04,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:04,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:04,454] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:04,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:04,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:05,928] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:06,390] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:06,223] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:48,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:48,868] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:49,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:49,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:50,308] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:51,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:51,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:53,567] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:54,327] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:54,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:54,620] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:54,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:54,856] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:55,481] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:56,293] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:56:56,534] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:36,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:37,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:37,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:38,449] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:40,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:41,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:42,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:42,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:44,290] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:44,165] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:44,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:44,761] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:44,843] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:45,147] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:46,014] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 02:57:46,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 02:58:37,699 (deepspeed_trainer:228) INFO: 30epoch:train:13101-13200batch: iter_time=2.527, loss_ctc=66.361, loss_att=54.428, acc=0.719, loss=57.987, grad_norm=5.049, loss_scale=1.000, learning_rate=7.318e-05, step_time=0.354 [cnode7-012:0/16] 2024-12-08 02:59:10,818 (deepspeed_trainer:228) INFO: 30epoch:train:13201-13300batch: iter_time=1.130e-04, loss_ctc=69.096, loss_att=50.640, acc=0.732, loss=56.170, grad_norm=5.591, loss_scale=1.000, learning_rate=7.317e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 02:59:44,245 (deepspeed_trainer:228) INFO: 30epoch:train:13301-13400batch: iter_time=1.081e-04, loss_ctc=76.426, loss_att=55.952, acc=0.735, loss=62.121, grad_norm=5.435, loss_scale=1.000, learning_rate=7.316e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 03:00:17,271 (deepspeed_trainer:228) INFO: 30epoch:train:13401-13500batch: iter_time=1.150e-04, loss_ctc=66.688, loss_att=46.491, acc=0.726, loss=52.540, grad_norm=5.163, loss_scale=1.000, learning_rate=7.316e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 03:00:50,508 (deepspeed_trainer:228) INFO: 30epoch:train:13501-13600batch: iter_time=1.090e-04, loss_ctc=67.885, loss_att=51.120, acc=0.727, loss=56.147, grad_norm=5.215, loss_scale=1.000, learning_rate=7.315e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 03:01:23,656 (deepspeed_trainer:228) INFO: 30epoch:train:13601-13700batch: iter_time=1.140e-04, loss_ctc=75.869, loss_att=55.772, acc=0.720, loss=61.800, grad_norm=5.247, loss_scale=1.000, learning_rate=7.314e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 03:01:56,903 (deepspeed_trainer:228) INFO: 30epoch:train:13701-13800batch: iter_time=1.100e-04, loss_ctc=69.421, loss_att=51.085, acc=0.724, loss=56.604, grad_norm=5.218, loss_scale=1.000, learning_rate=7.313e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 03:02:30,120 (deepspeed_trainer:228) INFO: 30epoch:train:13801-13900batch: iter_time=1.102e-04, loss_ctc=73.709, loss_att=60.528, acc=0.711, loss=64.493, grad_norm=5.409, loss_scale=1.000, learning_rate=7.312e-05, step_time=0.332 [2024-12-08 03:03:02,882] [INFO] [logging.py:129:log_dist] [Rank 0] step=449000, skipped=0, lr=[np.float64(7.311087249250565e-05)], mom=[[0.9, 0.98]] [2024-12-08 03:03:02,882] [INFO] [timer.py:264:stop] epoch=0/micro_step=59000/global_step=59000, RunningAvgSamplesPerSec=49.27625716040787, CurrSamplesPerSec=49.365690008223325, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 03:03:02,885 (deepspeed_trainer:228) INFO: 30epoch:train:13901-14000batch: iter_time=1.114e-04, loss_ctc=68.042, loss_att=51.866, acc=0.717, loss=56.723, grad_norm=5.175, loss_scale=1.000, learning_rate=7.311e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 03:03:35,941 (deepspeed_trainer:228) INFO: 30epoch:train:14001-14100batch: iter_time=1.132e-04, loss_ctc=68.894, loss_att=51.461, acc=0.726, loss=56.695, grad_norm=4.702, loss_scale=1.000, learning_rate=7.311e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 03:04:09,341 (deepspeed_trainer:228) INFO: 30epoch:train:14101-14200batch: iter_time=1.190e-04, loss_ctc=71.133, loss_att=52.995, acc=0.726, loss=58.429, grad_norm=5.197, loss_scale=1.000, learning_rate=7.310e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 03:04:42,803 (deepspeed_trainer:228) INFO: 30epoch:train:14201-14300batch: iter_time=1.131e-04, loss_ctc=74.153, loss_att=56.222, acc=0.724, loss=61.594, grad_norm=5.205, loss_scale=1.000, learning_rate=7.309e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 03:05:15,735 (deepspeed_trainer:228) INFO: 30epoch:train:14301-14400batch: iter_time=1.136e-04, loss_ctc=68.290, loss_att=49.980, acc=0.725, loss=55.470, grad_norm=5.490, loss_scale=1.000, learning_rate=7.308e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 03:05:48,962 (deepspeed_trainer:228) INFO: 30epoch:train:14401-14500batch: iter_time=1.121e-04, loss_ctc=72.574, loss_att=61.982, acc=0.713, loss=65.191, grad_norm=6.186, loss_scale=1.000, learning_rate=7.307e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 03:06:21,900 (deepspeed_trainer:228) INFO: 30epoch:train:14501-14600batch: iter_time=1.125e-04, loss_ctc=70.280, loss_att=53.218, acc=0.727, loss=58.338, grad_norm=5.394, loss_scale=1.000, learning_rate=7.307e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 03:06:54,797 (deepspeed_trainer:228) INFO: 30epoch:train:14601-14700batch: iter_time=1.090e-04, loss_ctc=72.844, loss_att=56.123, acc=0.724, loss=61.133, grad_norm=5.759, loss_scale=1.000, learning_rate=7.306e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 03:07:27,615 (deepspeed_trainer:228) INFO: 30epoch:train:14701-14800batch: iter_time=1.099e-04, loss_ctc=75.518, loss_att=53.342, acc=0.730, loss=60.009, grad_norm=5.772, loss_scale=1.000, learning_rate=7.305e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 03:08:00,419 (deepspeed_trainer:228) INFO: 30epoch:train:14801-14900batch: iter_time=1.089e-04, loss_ctc=69.084, loss_att=49.278, acc=0.723, loss=55.225, grad_norm=5.142, loss_scale=1.000, learning_rate=7.304e-05, step_time=0.328 [2024-12-08 03:08:33,045] [INFO] [logging.py:129:log_dist] [Rank 0] step=450000, skipped=0, lr=[np.float64(7.30295931900748e-05)], mom=[[0.9, 0.98]] [2024-12-08 03:08:33,046] [INFO] [timer.py:264:stop] epoch=0/micro_step=60000/global_step=60000, RunningAvgSamplesPerSec=49.28828642340153, CurrSamplesPerSec=53.85917365963497, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 03:08:33,048 (deepspeed_trainer:228) INFO: 30epoch:train:14901-15000batch: iter_time=1.073e-04, loss_ctc=69.714, loss_att=55.032, acc=0.711, loss=59.432, grad_norm=5.691, loss_scale=1.000, learning_rate=7.303e-05, step_time=0.326 [2024-12-08 03:08:49,847] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:08:50,036] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:08:50,423] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:08:50,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:08:50,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:08:50,423] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:08:50,569] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:08:50,846] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:08:50,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:08:50,661] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:08:50,992] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:08:50,749] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:08:51,090] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:08:51,142] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:08:51,155] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:08:51,167] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:05,283] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:05,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:06,193] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:06,061] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:06,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:06,944] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:07,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:07,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:07,608] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:07,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:07,897] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:08,039] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:08,106] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:08,159] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:08,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:08,752] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:20,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:21,080] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:21,398] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:21,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:22,535] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:22,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:22,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:23,034] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:23,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:23,789] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:24,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:24,145] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:24,722] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:24,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:25,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:25,369] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:35,519] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:36,695] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:37,133] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:37,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:37,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:38,131] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:38,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:38,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:39,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:39,290] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:39,563] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:39,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:40,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:40,985] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:41,330] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:41,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:09:53,866] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 30 is about to be saved! [2024-12-08 03:09:53,898] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/mp_rank_00_model_states.pt [2024-12-08 03:09:53,898] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/mp_rank_00_model_states.pt... [2024-12-08 03:09:56,055] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/mp_rank_00_model_states.pt. [2024-12-08 03:09:56,448] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,214] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,214] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,214] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,450] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,215] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,452] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,452] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,219] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,454] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,219] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,455] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,456] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,221] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,222] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,457] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-08 03:09:56,826] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-08 03:09:56,830] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-08 03:09:56,830] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [2024-12-08 03:09:57,005] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-08 03:09:57,005] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-08 03:09:57,005] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [2024-12-08 03:09:57,019] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-08 03:09:57,019] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-08 03:09:57,019] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [2024-12-08 03:09:57,023] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-08 03:09:57,023] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-08 03:09:57,023] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [2024-12-08 03:09:57,025] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-08 03:09:57,025] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-08 03:09:57,025] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [2024-12-08 03:09:57,039] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-08 03:09:57,039] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-08 03:09:57,039] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [2024-12-08 03:09:57,277] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-08 03:09:57,277] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-08 03:09:57,277] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [2024-12-08 03:09:57,303] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-08 03:09:57,303] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-08 03:09:57,303] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [2024-12-08 03:09:57,076] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-08 03:09:57,076] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-08 03:09:57,076] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [2024-12-08 03:09:57,324] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-08 03:09:57,325] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-08 03:09:57,325] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [2024-12-08 03:09:57,350] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-08 03:09:57,350] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-08 03:09:57,350] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [2024-12-08 03:09:57,116] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-08 03:09:57,116] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-08 03:09:57,116] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [2024-12-08 03:09:57,375] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-08 03:09:57,375] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-08 03:09:57,375] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [2024-12-08 03:09:57,376] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-08 03:09:57,377] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-08 03:09:57,377] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [2024-12-08 03:09:57,399] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-08 03:09:57,399] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-08 03:09:57,399] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [2024-12-08 03:09:57,401] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-08 03:09:57,401] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_30/30/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-08 03:09:57,401] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 30 is ready now! [cnode7-012:0/16] 2024-12-08 03:09:57,176 (deepspeed_trainer:158) INFO: 30epoch results: [train] iter_time=0.134, loss_ctc=71.511, loss_att=53.906, acc=0.718, loss=59.188, grad_norm=5.223, loss_scale=1.000, learning_rate=7.365e-05, step_time=0.334, time=1 hour, 57 minutes and 22.1 seconds, total_count=450030, gpu_max_cached_mem_GB=40.352, [valid] loss_ctc=3.641, cer_ctc=0.089, loss_att=7.281, acc=0.762, cer=0.483, wer=1.000, loss=6.188, time=1 minute and 10.67 seconds, total_count=30, gpu_max_cached_mem_GB=40.352 [cnode7-012:0/16] 2024-12-08 03:10:00,490 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-08 03:10:27,234 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 03:10:42,999 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 03:10:43,000 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 03:10:43,002 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 03:11:03,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:04,867] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:04,701] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:04,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:05,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:05,424] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:05,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:05,562] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:05,869] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:06,922] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:06,705] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:06,986] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:07,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:07,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:08,300] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:08,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:51,944] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:52,551] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:52,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:53,586] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:54,542] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:55,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:55,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:55,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:56,401] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:56,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:56,897] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:57,042] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:57,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:57,265] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:57,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:11:58,118] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:39,835] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:40,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:40,907] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:42,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:42,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:43,142] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:44,500] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:44,752] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:45,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:45,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:45,981] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:46,777] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:46,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:47,089] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:47,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:12:48,528] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:26,616] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:27,320] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:27,384] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:30,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:30,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:32,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:32,997] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:33,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:34,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:34,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:34,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:34,983] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:35,451] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:35,605] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:35,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:13:38,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 03:14:39,060 (deepspeed_trainer:228) INFO: 31epoch:train:1-100batch: iter_time=2.432, loss_ctc=71.011, loss_att=57.507, acc=0.690, loss=61.543, grad_norm=5.920, loss_scale=1.000, learning_rate=7.303e-05, step_time=0.353 [cnode7-012:0/16] 2024-12-08 03:15:12,601 (deepspeed_trainer:228) INFO: 31epoch:train:101-200batch: iter_time=1.177e-04, loss_ctc=68.740, loss_att=49.808, acc=0.714, loss=55.479, grad_norm=6.069, loss_scale=1.000, learning_rate=7.302e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 03:15:45,943 (deepspeed_trainer:228) INFO: 31epoch:train:201-300batch: iter_time=1.142e-04, loss_ctc=69.567, loss_att=53.065, acc=0.704, loss=58.042, grad_norm=6.678, loss_scale=1.000, learning_rate=7.301e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 03:16:19,313 (deepspeed_trainer:228) INFO: 31epoch:train:301-400batch: iter_time=1.202e-04, loss_ctc=73.118, loss_att=60.638, acc=0.687, loss=64.379, grad_norm=6.147, loss_scale=1.000, learning_rate=7.300e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 03:16:52,631 (deepspeed_trainer:228) INFO: 31epoch:train:401-500batch: iter_time=1.177e-04, loss_ctc=74.244, loss_att=61.706, acc=0.699, loss=65.484, grad_norm=5.396, loss_scale=1.000, learning_rate=7.299e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 03:17:26,236 (deepspeed_trainer:228) INFO: 31epoch:train:501-600batch: iter_time=1.112e-04, loss_ctc=79.426, loss_att=59.727, acc=0.704, loss=65.638, grad_norm=5.510, loss_scale=1.000, learning_rate=7.298e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 03:18:00,109 (deepspeed_trainer:228) INFO: 31epoch:train:601-700batch: iter_time=1.119e-04, loss_ctc=64.335, loss_att=50.528, acc=0.712, loss=54.667, grad_norm=5.347, loss_scale=1.000, learning_rate=7.298e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 03:18:33,705 (deepspeed_trainer:228) INFO: 31epoch:train:701-800batch: iter_time=1.128e-04, loss_ctc=75.528, loss_att=51.755, acc=0.718, loss=58.891, grad_norm=5.649, loss_scale=1.000, learning_rate=7.297e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 03:19:07,075 (deepspeed_trainer:228) INFO: 31epoch:train:801-900batch: iter_time=1.139e-04, loss_ctc=65.764, loss_att=50.562, acc=0.702, loss=55.122, grad_norm=5.512, loss_scale=1.000, learning_rate=7.296e-05, step_time=0.333 [2024-12-08 03:19:40,503] [INFO] [logging.py:129:log_dist] [Rank 0] step=451000, skipped=0, lr=[np.float64(7.294858436739053e-05)], mom=[[0.9, 0.98]] [2024-12-08 03:19:40,504] [INFO] [timer.py:264:stop] epoch=0/micro_step=61000/global_step=61000, RunningAvgSamplesPerSec=49.28407209494289, CurrSamplesPerSec=48.61984554764698, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 03:19:40,507 (deepspeed_trainer:228) INFO: 31epoch:train:901-1000batch: iter_time=1.118e-04, loss_ctc=75.281, loss_att=58.212, acc=0.713, loss=63.351, grad_norm=5.871, loss_scale=1.000, learning_rate=7.295e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 03:20:14,098 (deepspeed_trainer:228) INFO: 31epoch:train:1001-1100batch: iter_time=1.130e-04, loss_ctc=69.364, loss_att=50.129, acc=0.715, loss=55.897, grad_norm=5.260, loss_scale=1.000, learning_rate=7.294e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 03:20:47,456 (deepspeed_trainer:228) INFO: 31epoch:train:1101-1200batch: iter_time=1.107e-04, loss_ctc=70.188, loss_att=51.917, acc=0.722, loss=57.390, grad_norm=5.226, loss_scale=1.000, learning_rate=7.294e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 03:21:21,169 (deepspeed_trainer:228) INFO: 31epoch:train:1201-1300batch: iter_time=1.125e-04, loss_ctc=77.668, loss_att=62.079, acc=0.713, loss=66.709, grad_norm=5.376, loss_scale=1.000, learning_rate=7.293e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 03:21:54,862 (deepspeed_trainer:228) INFO: 31epoch:train:1301-1400batch: iter_time=1.124e-04, loss_ctc=71.983, loss_att=56.556, acc=0.711, loss=61.193, grad_norm=5.585, loss_scale=1.000, learning_rate=7.292e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 03:22:28,261 (deepspeed_trainer:228) INFO: 31epoch:train:1401-1500batch: iter_time=1.364e-04, loss_ctc=76.532, loss_att=57.114, acc=0.712, loss=62.888, grad_norm=7.398, loss_scale=1.000, learning_rate=7.291e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 03:23:01,562 (deepspeed_trainer:228) INFO: 31epoch:train:1501-1600batch: iter_time=1.121e-04, loss_ctc=69.079, loss_att=51.257, acc=0.707, loss=56.589, grad_norm=6.248, loss_scale=1.000, learning_rate=7.290e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 03:23:35,082 (deepspeed_trainer:228) INFO: 31epoch:train:1601-1700batch: iter_time=1.119e-04, loss_ctc=67.809, loss_att=49.686, acc=0.738, loss=55.126, grad_norm=5.468, loss_scale=1.000, learning_rate=7.290e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 03:24:08,739 (deepspeed_trainer:228) INFO: 31epoch:train:1701-1800batch: iter_time=1.119e-04, loss_ctc=70.902, loss_att=53.236, acc=0.717, loss=58.554, grad_norm=5.663, loss_scale=1.000, learning_rate=7.289e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 03:24:39,698 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-08 03:25:05,334 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 03:25:21,779 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 03:25:21,779 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 03:25:21,781 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 03:25:47,974] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:25:48,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:25:48,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:25:51,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:25:52,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:25:52,447] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:25:52,622] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:25:52,663] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:25:52,888] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:25:53,012] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:25:53,049] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:25:53,068] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:25:53,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:25:52,973] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:25:52,984] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:25:53,448] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:37,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:37,888] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:37,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:40,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:41,449] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:41,915] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:42,537] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:42,574] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:44,598] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:44,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:45,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:45,524] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:45,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:46,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:46,723] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:26:46,814] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:26,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:27,145] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:27,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:30,033] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:30,830] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:31,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:31,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:31,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:33,471] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:34,251] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:34,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:35,024] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:35,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:35,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:35,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:27:36,754] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:13,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:15,119] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:15,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:19,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:20,411] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:20,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:20,989] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:21,278] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:22,159] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:22,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:23,423] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:24,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:24,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:25,018] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:25,338] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:28:27,354] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 03:29:02,577 (deepspeed_trainer:228) INFO: 31epoch:train:1801-1900batch: iter_time=2.537, loss_ctc=78.421, loss_att=58.553, acc=0.712, loss=64.492, grad_norm=5.510, loss_scale=1.000, learning_rate=7.288e-05, step_time=0.401 [2024-12-08 03:29:36,744] [INFO] [logging.py:129:log_dist] [Rank 0] step=452000, skipped=0, lr=[np.float64(7.286784452761091e-05)], mom=[[0.9, 0.98]] [2024-12-08 03:29:36,745] [INFO] [timer.py:264:stop] epoch=0/micro_step=62000/global_step=62000, RunningAvgSamplesPerSec=49.26603035903661, CurrSamplesPerSec=45.73454234255047, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 03:29:36,748 (deepspeed_trainer:228) INFO: 31epoch:train:1901-2000batch: iter_time=1.145e-04, loss_ctc=70.922, loss_att=56.448, acc=0.703, loss=60.815, grad_norm=6.101, loss_scale=1.000, learning_rate=7.287e-05, step_time=0.341 [cnode7-012:0/16] 2024-12-08 03:30:10,543 (deepspeed_trainer:228) INFO: 31epoch:train:2001-2100batch: iter_time=1.152e-04, loss_ctc=64.729, loss_att=45.234, acc=0.721, loss=51.075, grad_norm=5.708, loss_scale=1.000, learning_rate=7.286e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 03:30:44,347 (deepspeed_trainer:228) INFO: 31epoch:train:2101-2200batch: iter_time=1.111e-04, loss_ctc=72.082, loss_att=56.254, acc=0.714, loss=60.992, grad_norm=5.820, loss_scale=1.000, learning_rate=7.286e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 03:31:18,441 (deepspeed_trainer:228) INFO: 31epoch:train:2201-2300batch: iter_time=1.074e-04, loss_ctc=71.584, loss_att=59.184, acc=0.704, loss=62.910, grad_norm=5.197, loss_scale=1.000, learning_rate=7.285e-05, step_time=0.341 [cnode7-012:0/16] 2024-12-08 03:31:52,028 (deepspeed_trainer:228) INFO: 31epoch:train:2301-2400batch: iter_time=1.055e-04, loss_ctc=75.864, loss_att=64.755, acc=0.708, loss=68.060, grad_norm=5.793, loss_scale=1.000, learning_rate=7.284e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 03:32:25,441 (deepspeed_trainer:228) INFO: 31epoch:train:2401-2500batch: iter_time=1.109e-04, loss_ctc=72.323, loss_att=56.554, acc=0.713, loss=61.257, grad_norm=6.069, loss_scale=1.000, learning_rate=7.283e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 03:32:58,777 (deepspeed_trainer:228) INFO: 31epoch:train:2501-2600batch: iter_time=1.127e-04, loss_ctc=66.825, loss_att=52.825, acc=0.716, loss=57.013, grad_norm=5.121, loss_scale=1.000, learning_rate=7.282e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 03:33:32,177 (deepspeed_trainer:228) INFO: 31epoch:train:2601-2700batch: iter_time=1.128e-04, loss_ctc=75.489, loss_att=54.250, acc=0.713, loss=60.615, grad_norm=6.122, loss_scale=1.000, learning_rate=7.282e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 03:34:05,580 (deepspeed_trainer:228) INFO: 31epoch:train:2701-2800batch: iter_time=1.110e-04, loss_ctc=68.775, loss_att=49.982, acc=0.726, loss=55.614, grad_norm=4.945, loss_scale=1.000, learning_rate=7.281e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 03:34:39,334 (deepspeed_trainer:228) INFO: 31epoch:train:2801-2900batch: iter_time=1.134e-04, loss_ctc=69.310, loss_att=54.363, acc=0.715, loss=58.851, grad_norm=5.572, loss_scale=1.000, learning_rate=7.280e-05, step_time=0.337 [2024-12-08 03:35:12,405] [INFO] [logging.py:129:log_dist] [Rank 0] step=453000, skipped=0, lr=[np.float64(7.278737218546534e-05)], mom=[[0.9, 0.98]] [2024-12-08 03:35:12,405] [INFO] [timer.py:264:stop] epoch=0/micro_step=63000/global_step=63000, RunningAvgSamplesPerSec=49.26370811174175, CurrSamplesPerSec=45.766823981463155, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 03:35:12,408 (deepspeed_trainer:228) INFO: 31epoch:train:2901-3000batch: iter_time=1.115e-04, loss_ctc=69.943, loss_att=50.445, acc=0.728, loss=56.298, grad_norm=5.380, loss_scale=1.000, learning_rate=7.279e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 03:35:46,166 (deepspeed_trainer:228) INFO: 31epoch:train:3001-3100batch: iter_time=1.119e-04, loss_ctc=68.691, loss_att=53.564, acc=0.723, loss=58.139, grad_norm=5.026, loss_scale=1.000, learning_rate=7.278e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 03:36:19,795 (deepspeed_trainer:228) INFO: 31epoch:train:3101-3200batch: iter_time=1.136e-04, loss_ctc=79.865, loss_att=61.844, acc=0.721, loss=67.258, grad_norm=5.282, loss_scale=1.000, learning_rate=7.278e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 03:36:53,010 (deepspeed_trainer:228) INFO: 31epoch:train:3201-3300batch: iter_time=1.121e-04, loss_ctc=65.808, loss_att=50.874, acc=0.719, loss=55.372, grad_norm=5.660, loss_scale=1.000, learning_rate=7.277e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 03:37:26,654 (deepspeed_trainer:228) INFO: 31epoch:train:3301-3400batch: iter_time=1.142e-04, loss_ctc=75.483, loss_att=58.484, acc=0.715, loss=63.598, grad_norm=7.438, loss_scale=1.000, learning_rate=7.276e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 03:38:00,271 (deepspeed_trainer:228) INFO: 31epoch:train:3401-3500batch: iter_time=1.114e-04, loss_ctc=71.924, loss_att=55.030, acc=0.722, loss=60.080, grad_norm=5.566, loss_scale=1.000, learning_rate=7.275e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 03:38:33,566 (deepspeed_trainer:228) INFO: 31epoch:train:3501-3600batch: iter_time=1.093e-04, loss_ctc=61.671, loss_att=45.722, acc=0.739, loss=50.506, grad_norm=5.081, loss_scale=1.000, learning_rate=7.274e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 03:39:07,022 (deepspeed_trainer:228) INFO: 31epoch:train:3601-3700batch: iter_time=1.148e-04, loss_ctc=76.064, loss_att=57.397, acc=0.714, loss=62.983, grad_norm=5.576, loss_scale=1.000, learning_rate=7.274e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 03:39:27,918 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-08 03:39:54,093 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 03:40:12,426 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 03:40:12,426 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 03:40:12,428 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 03:40:33,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:40:34,806] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:40:35,261] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:40:36,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:40:36,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:40:36,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:40:37,447] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:40:38,460] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:40:39,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:40:39,121] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:40:39,974] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:40:39,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:40:39,228] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:40:40,694] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:40:41,580] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:40:43,299] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:20,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:22,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:23,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:24,537] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:23,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:27,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:28,263] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:27,796] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:28,720] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:28,091] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:29,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:28,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:29,959] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:29,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:29,743] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:41:31,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:07,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:10,320] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:10,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:12,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:12,652] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:15,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:16,929] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:16,974] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:16,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:16,823] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:18,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:17,694] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:18,201] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:19,345] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:18,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:20,447] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:54,842] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:58,418] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:59,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:59,888] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:42:59,106] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:43:04,963] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:43:05,926] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:43:06,246] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:43:06,394] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:43:06,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:43:07,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:43:08,717] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:43:07,932] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:43:08,429] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:43:08,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:43:10,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 03:43:53,973 (deepspeed_trainer:228) INFO: 31epoch:train:3701-3800batch: iter_time=2.507, loss_ctc=76.946, loss_att=58.629, acc=0.716, loss=64.088, grad_norm=6.063, loss_scale=1.000, learning_rate=7.273e-05, step_time=0.365 [cnode7-012:0/16] 2024-12-08 03:44:26,985 (deepspeed_trainer:228) INFO: 31epoch:train:3801-3900batch: iter_time=1.132e-04, loss_ctc=73.375, loss_att=56.278, acc=0.709, loss=61.409, grad_norm=5.477, loss_scale=1.000, learning_rate=7.272e-05, step_time=0.330 [2024-12-08 03:45:00,065] [INFO] [logging.py:129:log_dist] [Rank 0] step=454000, skipped=0, lr=[np.float64(7.270716586713985e-05)], mom=[[0.9, 0.98]] [2024-12-08 03:45:00,065] [INFO] [timer.py:264:stop] epoch=0/micro_step=64000/global_step=64000, RunningAvgSamplesPerSec=49.258176838230376, CurrSamplesPerSec=48.50854431013891, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 03:45:00,068 (deepspeed_trainer:228) INFO: 31epoch:train:3901-4000batch: iter_time=1.160e-04, loss_ctc=60.180, loss_att=42.157, acc=0.731, loss=47.548, grad_norm=5.645, loss_scale=1.000, learning_rate=7.271e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 03:45:33,574 (deepspeed_trainer:228) INFO: 31epoch:train:4001-4100batch: iter_time=1.149e-04, loss_ctc=72.919, loss_att=58.398, acc=0.702, loss=62.753, grad_norm=5.371, loss_scale=1.000, learning_rate=7.270e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 03:46:07,301 (deepspeed_trainer:228) INFO: 31epoch:train:4101-4200batch: iter_time=1.105e-04, loss_ctc=78.602, loss_att=64.390, acc=0.709, loss=68.674, grad_norm=5.814, loss_scale=1.000, learning_rate=7.270e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 03:46:40,584 (deepspeed_trainer:228) INFO: 31epoch:train:4201-4300batch: iter_time=1.112e-04, loss_ctc=69.696, loss_att=59.375, acc=0.710, loss=62.451, grad_norm=5.838, loss_scale=1.000, learning_rate=7.269e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 03:47:13,233 (deepspeed_trainer:228) INFO: 31epoch:train:4301-4400batch: iter_time=1.121e-04, loss_ctc=66.971, loss_att=50.292, acc=0.718, loss=55.296, grad_norm=5.052, loss_scale=1.000, learning_rate=7.268e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 03:47:46,435 (deepspeed_trainer:228) INFO: 31epoch:train:4401-4500batch: iter_time=1.123e-04, loss_ctc=68.369, loss_att=54.417, acc=0.726, loss=58.604, grad_norm=6.458, loss_scale=1.000, learning_rate=7.267e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 03:48:19,367 (deepspeed_trainer:228) INFO: 31epoch:train:4501-4600batch: iter_time=1.106e-04, loss_ctc=74.366, loss_att=51.570, acc=0.718, loss=58.398, grad_norm=5.144, loss_scale=1.000, learning_rate=7.266e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 03:48:52,466 (deepspeed_trainer:228) INFO: 31epoch:train:4601-4700batch: iter_time=1.127e-04, loss_ctc=68.708, loss_att=51.542, acc=0.723, loss=56.686, grad_norm=5.200, loss_scale=1.000, learning_rate=7.266e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 03:49:25,644 (deepspeed_trainer:228) INFO: 31epoch:train:4701-4800batch: iter_time=1.123e-04, loss_ctc=71.394, loss_att=55.255, acc=0.714, loss=60.095, grad_norm=4.946, loss_scale=1.000, learning_rate=7.265e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 03:49:58,678 (deepspeed_trainer:228) INFO: 31epoch:train:4801-4900batch: iter_time=1.119e-04, loss_ctc=67.206, loss_att=50.386, acc=0.728, loss=55.437, grad_norm=4.860, loss_scale=1.000, learning_rate=7.264e-05, step_time=0.330 [2024-12-08 03:50:32,004] [INFO] [logging.py:129:log_dist] [Rank 0] step=455000, skipped=0, lr=[np.float64(7.262722411016376e-05)], mom=[[0.9, 0.98]] [2024-12-08 03:50:32,004] [INFO] [timer.py:264:stop] epoch=0/micro_step=65000/global_step=65000, RunningAvgSamplesPerSec=49.26479477897235, CurrSamplesPerSec=49.00126866520011, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 03:50:32,007 (deepspeed_trainer:228) INFO: 31epoch:train:4901-5000batch: iter_time=1.108e-04, loss_ctc=73.917, loss_att=60.476, acc=0.724, loss=64.520, grad_norm=5.615, loss_scale=1.000, learning_rate=7.263e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 03:51:05,884 (deepspeed_trainer:228) INFO: 31epoch:train:5001-5100batch: iter_time=1.125e-04, loss_ctc=71.766, loss_att=51.705, acc=0.731, loss=57.718, grad_norm=4.941, loss_scale=1.000, learning_rate=7.262e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 03:51:38,799 (deepspeed_trainer:228) INFO: 31epoch:train:5101-5200batch: iter_time=1.121e-04, loss_ctc=72.135, loss_att=56.074, acc=0.717, loss=60.906, grad_norm=6.094, loss_scale=1.000, learning_rate=7.262e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 03:52:11,889 (deepspeed_trainer:228) INFO: 31epoch:train:5201-5300batch: iter_time=1.165e-04, loss_ctc=69.071, loss_att=53.637, acc=0.719, loss=58.244, grad_norm=7.268, loss_scale=1.000, learning_rate=7.261e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 03:52:45,200 (deepspeed_trainer:228) INFO: 31epoch:train:5301-5400batch: iter_time=1.106e-04, loss_ctc=73.277, loss_att=56.050, acc=0.725, loss=61.218, grad_norm=5.910, loss_scale=1.000, learning_rate=7.260e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 03:53:18,121 (deepspeed_trainer:228) INFO: 31epoch:train:5401-5500batch: iter_time=1.106e-04, loss_ctc=65.489, loss_att=47.217, acc=0.732, loss=52.687, grad_norm=5.351, loss_scale=1.000, learning_rate=7.259e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 03:53:51,405 (deepspeed_trainer:228) INFO: 31epoch:train:5501-5600batch: iter_time=1.089e-04, loss_ctc=76.665, loss_att=56.597, acc=0.721, loss=62.614, grad_norm=5.285, loss_scale=1.000, learning_rate=7.258e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 03:54:03,730 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-08 03:54:31,265 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 03:54:47,608 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 03:54:47,608 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 03:54:47,610 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 03:55:11,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:55:12,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:55:12,270] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:55:11,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:55:11,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:55:13,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:55:15,112] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:55:15,717] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:55:14,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:55:14,997] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:55:16,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:55:17,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:55:17,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:55:16,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:55:17,017] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:55:17,300] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:00,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:01,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:01,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:00,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:00,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:01,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:05,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:05,031] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:06,553] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:05,699] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:07,127] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:07,500] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:07,948] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:08,338] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:08,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:08,580] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:48,529] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:48,907] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:49,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:51,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:50,656] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:52,300] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:55,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:55,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:56,429] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:56,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:56,014] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:56,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:57,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:58,332] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:58,534] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:56:58,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:36,737] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:35,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:37,953] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:39,369] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:41,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:42,623] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:43,380] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:45,014] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:46,183] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:46,350] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:44,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:46,423] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:46,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:48,031] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:48,505] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 03:57:50,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 03:58:45,338 (deepspeed_trainer:228) INFO: 31epoch:train:5601-5700batch: iter_time=2.541, loss_ctc=72.539, loss_att=58.644, acc=0.712, loss=62.807, grad_norm=5.168, loss_scale=1.000, learning_rate=7.258e-05, step_time=0.398 [cnode7-012:0/16] 2024-12-08 03:59:18,752 (deepspeed_trainer:228) INFO: 31epoch:train:5701-5800batch: iter_time=1.106e-04, loss_ctc=69.841, loss_att=52.139, acc=0.725, loss=57.429, grad_norm=5.177, loss_scale=1.000, learning_rate=7.257e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 03:59:52,113 (deepspeed_trainer:228) INFO: 31epoch:train:5801-5900batch: iter_time=1.103e-04, loss_ctc=63.891, loss_att=46.388, acc=0.723, loss=51.643, grad_norm=5.276, loss_scale=1.000, learning_rate=7.256e-05, step_time=0.333 [2024-12-08 04:00:25,378] [INFO] [logging.py:129:log_dist] [Rank 0] step=456000, skipped=0, lr=[np.float64(7.254754546329756e-05)], mom=[[0.9, 0.98]] [2024-12-08 04:00:25,378] [INFO] [timer.py:264:stop] epoch=0/micro_step=66000/global_step=66000, RunningAvgSamplesPerSec=49.255359705193285, CurrSamplesPerSec=51.11863753474943, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 04:00:25,381 (deepspeed_trainer:228) INFO: 31epoch:train:5901-6000batch: iter_time=1.091e-04, loss_ctc=68.094, loss_att=54.664, acc=0.704, loss=58.697, grad_norm=5.529, loss_scale=1.000, learning_rate=7.255e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 04:00:58,723 (deepspeed_trainer:228) INFO: 31epoch:train:6001-6100batch: iter_time=1.089e-04, loss_ctc=76.059, loss_att=63.820, acc=0.708, loss=67.500, grad_norm=5.387, loss_scale=1.000, learning_rate=7.254e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 04:01:32,133 (deepspeed_trainer:228) INFO: 31epoch:train:6101-6200batch: iter_time=1.131e-04, loss_ctc=77.135, loss_att=62.524, acc=0.716, loss=66.889, grad_norm=5.782, loss_scale=1.000, learning_rate=7.254e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 04:02:04,984 (deepspeed_trainer:228) INFO: 31epoch:train:6201-6300batch: iter_time=1.110e-04, loss_ctc=61.556, loss_att=47.979, acc=0.720, loss=52.035, grad_norm=5.255, loss_scale=1.000, learning_rate=7.253e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 04:02:38,088 (deepspeed_trainer:228) INFO: 31epoch:train:6301-6400batch: iter_time=1.096e-04, loss_ctc=70.923, loss_att=54.826, acc=0.722, loss=59.648, grad_norm=5.474, loss_scale=1.000, learning_rate=7.252e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 04:03:11,015 (deepspeed_trainer:228) INFO: 31epoch:train:6401-6500batch: iter_time=1.110e-04, loss_ctc=69.647, loss_att=49.529, acc=0.720, loss=55.570, grad_norm=5.127, loss_scale=1.000, learning_rate=7.251e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 04:03:43,944 (deepspeed_trainer:228) INFO: 31epoch:train:6501-6600batch: iter_time=1.093e-04, loss_ctc=71.641, loss_att=54.211, acc=0.724, loss=59.451, grad_norm=6.749, loss_scale=1.000, learning_rate=7.250e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 04:04:16,711 (deepspeed_trainer:228) INFO: 31epoch:train:6601-6700batch: iter_time=1.096e-04, loss_ctc=71.160, loss_att=53.614, acc=0.715, loss=58.853, grad_norm=5.686, loss_scale=1.000, learning_rate=7.250e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 04:04:49,476 (deepspeed_trainer:228) INFO: 31epoch:train:6701-6800batch: iter_time=1.087e-04, loss_ctc=69.251, loss_att=51.933, acc=0.725, loss=57.118, grad_norm=4.851, loss_scale=1.000, learning_rate=7.249e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 04:05:22,531 (deepspeed_trainer:228) INFO: 31epoch:train:6801-6900batch: iter_time=1.138e-04, loss_ctc=74.867, loss_att=59.941, acc=0.729, loss=64.417, grad_norm=5.755, loss_scale=1.000, learning_rate=7.248e-05, step_time=0.330 [2024-12-08 04:05:55,690] [INFO] [logging.py:129:log_dist] [Rank 0] step=457000, skipped=0, lr=[np.float64(7.24681284864224e-05)], mom=[[0.9, 0.98]] [2024-12-08 04:05:55,693] [INFO] [timer.py:264:stop] epoch=0/micro_step=67000/global_step=67000, RunningAvgSamplesPerSec=49.265752550808315, CurrSamplesPerSec=47.622898231500756, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 04:05:55,696 (deepspeed_trainer:228) INFO: 31epoch:train:6901-7000batch: iter_time=1.093e-04, loss_ctc=73.824, loss_att=59.378, acc=0.720, loss=63.722, grad_norm=5.513, loss_scale=1.000, learning_rate=7.247e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 04:06:28,724 (deepspeed_trainer:228) INFO: 31epoch:train:7001-7100batch: iter_time=1.108e-04, loss_ctc=74.523, loss_att=55.512, acc=0.719, loss=61.229, grad_norm=6.733, loss_scale=1.000, learning_rate=7.246e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 04:07:01,672 (deepspeed_trainer:228) INFO: 31epoch:train:7101-7200batch: iter_time=1.093e-04, loss_ctc=61.901, loss_att=45.600, acc=0.728, loss=50.527, grad_norm=5.592, loss_scale=1.000, learning_rate=7.246e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 04:07:34,938 (deepspeed_trainer:228) INFO: 31epoch:train:7201-7300batch: iter_time=1.094e-04, loss_ctc=71.589, loss_att=54.751, acc=0.730, loss=59.784, grad_norm=5.654, loss_scale=1.000, learning_rate=7.245e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 04:08:07,816 (deepspeed_trainer:228) INFO: 31epoch:train:7301-7400batch: iter_time=1.107e-04, loss_ctc=68.916, loss_att=52.091, acc=0.727, loss=57.126, grad_norm=5.590, loss_scale=1.000, learning_rate=7.244e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 04:08:40,796 (deepspeed_trainer:228) INFO: 31epoch:train:7401-7500batch: iter_time=1.108e-04, loss_ctc=76.714, loss_att=56.418, acc=0.724, loss=62.507, grad_norm=5.369, loss_scale=1.000, learning_rate=7.243e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 04:08:45,141 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-08 04:09:12,911 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 04:09:30,256 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 04:09:30,256 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 04:09:30,259 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 04:09:51,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:09:51,223] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:09:51,487] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:09:54,312] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:09:55,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:09:55,150] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:09:55,592] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:09:56,676] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:09:57,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:09:57,665] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:09:57,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:09:57,262] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:09:58,280] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:09:58,282] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:09:57,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:09:58,844] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:38,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:38,989] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:42,085] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:42,891] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:45,212] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:44,780] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:46,167] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:46,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:46,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:47,141] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:46,917] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:47,682] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:46,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:47,203] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:48,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:10:47,591] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:25,406] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:25,594] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:30,773] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:32,533] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:32,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:35,331] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:34,996] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:35,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:36,084] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:35,396] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:36,313] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:35,620] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:36,648] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:35,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:36,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:11:37,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:11,720] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:12,014] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:18,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:20,494] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:22,362] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:23,330] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:23,699] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:24,222] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:25,469] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:25,927] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:26,570] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:26,608] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:26,384] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:27,255] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:27,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:12:28,584] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 04:13:28,578 (deepspeed_trainer:228) INFO: 31epoch:train:7501-7600batch: iter_time=2.547, loss_ctc=69.390, loss_att=57.221, acc=0.698, loss=60.855, grad_norm=5.734, loss_scale=1.000, learning_rate=7.242e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 04:14:01,521 (deepspeed_trainer:228) INFO: 31epoch:train:7601-7700batch: iter_time=1.130e-04, loss_ctc=66.726, loss_att=49.481, acc=0.719, loss=54.635, grad_norm=5.759, loss_scale=1.000, learning_rate=7.242e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 04:14:34,383 (deepspeed_trainer:228) INFO: 31epoch:train:7701-7800batch: iter_time=1.086e-04, loss_ctc=68.669, loss_att=52.639, acc=0.709, loss=57.431, grad_norm=5.844, loss_scale=1.000, learning_rate=7.241e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 04:15:07,260 (deepspeed_trainer:228) INFO: 31epoch:train:7801-7900batch: iter_time=1.091e-04, loss_ctc=71.044, loss_att=59.491, acc=0.694, loss=62.955, grad_norm=5.976, loss_scale=1.000, learning_rate=7.240e-05, step_time=0.329 [2024-12-08 04:15:40,320] [INFO] [logging.py:129:log_dist] [Rank 0] step=458000, skipped=0, lr=[np.float64(7.238897175043066e-05)], mom=[[0.9, 0.98]] [2024-12-08 04:15:40,320] [INFO] [timer.py:264:stop] epoch=0/micro_step=68000/global_step=68000, RunningAvgSamplesPerSec=49.274835858118415, CurrSamplesPerSec=49.07903348543692, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 04:15:40,323 (deepspeed_trainer:228) INFO: 31epoch:train:7901-8000batch: iter_time=1.094e-04, loss_ctc=72.721, loss_att=61.844, acc=0.702, loss=65.133, grad_norm=6.650, loss_scale=1.000, learning_rate=7.239e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 04:16:13,457 (deepspeed_trainer:228) INFO: 31epoch:train:8001-8100batch: iter_time=1.090e-04, loss_ctc=76.621, loss_att=58.983, acc=0.708, loss=64.250, grad_norm=5.290, loss_scale=1.000, learning_rate=7.238e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 04:16:45,879 (deepspeed_trainer:228) INFO: 31epoch:train:8101-8200batch: iter_time=1.120e-04, loss_ctc=63.076, loss_att=49.511, acc=0.718, loss=53.585, grad_norm=5.416, loss_scale=1.000, learning_rate=7.238e-05, step_time=0.324 [cnode7-012:0/16] 2024-12-08 04:17:18,565 (deepspeed_trainer:228) INFO: 31epoch:train:8201-8300batch: iter_time=1.137e-04, loss_ctc=73.129, loss_att=51.800, acc=0.719, loss=58.191, grad_norm=5.450, loss_scale=1.000, learning_rate=7.237e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 04:17:51,052 (deepspeed_trainer:228) INFO: 31epoch:train:8301-8400batch: iter_time=1.092e-04, loss_ctc=64.180, loss_att=50.324, acc=0.706, loss=54.452, grad_norm=5.775, loss_scale=1.000, learning_rate=7.236e-05, step_time=0.324 [cnode7-012:0/16] 2024-12-08 04:18:24,072 (deepspeed_trainer:228) INFO: 31epoch:train:8401-8500batch: iter_time=1.107e-04, loss_ctc=74.699, loss_att=59.204, acc=0.712, loss=63.866, grad_norm=5.619, loss_scale=1.000, learning_rate=7.235e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 04:18:56,718 (deepspeed_trainer:228) INFO: 31epoch:train:8501-8600batch: iter_time=1.092e-04, loss_ctc=68.177, loss_att=49.199, acc=0.717, loss=54.879, grad_norm=6.017, loss_scale=1.000, learning_rate=7.235e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 04:19:29,346 (deepspeed_trainer:228) INFO: 31epoch:train:8601-8700batch: iter_time=1.090e-04, loss_ctc=68.755, loss_att=51.755, acc=0.724, loss=56.859, grad_norm=5.449, loss_scale=1.000, learning_rate=7.234e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 04:20:02,390 (deepspeed_trainer:228) INFO: 31epoch:train:8701-8800batch: iter_time=1.088e-04, loss_ctc=76.300, loss_att=60.816, acc=0.719, loss=65.464, grad_norm=5.571, loss_scale=1.000, learning_rate=7.233e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 04:20:35,150 (deepspeed_trainer:228) INFO: 31epoch:train:8801-8900batch: iter_time=1.088e-04, loss_ctc=70.521, loss_att=56.407, acc=0.713, loss=60.633, grad_norm=5.955, loss_scale=1.000, learning_rate=7.232e-05, step_time=0.327 [2024-12-08 04:21:08,096] [INFO] [logging.py:129:log_dist] [Rank 0] step=459000, skipped=0, lr=[np.float64(7.231007383711801e-05)], mom=[[0.9, 0.98]] [2024-12-08 04:21:08,097] [INFO] [timer.py:264:stop] epoch=0/micro_step=69000/global_step=69000, RunningAvgSamplesPerSec=49.28977612453213, CurrSamplesPerSec=49.481328892826134, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 04:21:08,100 (deepspeed_trainer:228) INFO: 31epoch:train:8901-9000batch: iter_time=1.098e-04, loss_ctc=75.317, loss_att=58.208, acc=0.713, loss=63.307, grad_norm=7.362, loss_scale=1.000, learning_rate=7.231e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 04:21:40,949 (deepspeed_trainer:228) INFO: 31epoch:train:9001-9100batch: iter_time=1.095e-04, loss_ctc=66.312, loss_att=49.104, acc=0.716, loss=54.260, grad_norm=5.849, loss_scale=1.000, learning_rate=7.231e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 04:22:13,984 (deepspeed_trainer:228) INFO: 31epoch:train:9101-9200batch: iter_time=1.106e-04, loss_ctc=67.289, loss_att=49.492, acc=0.739, loss=54.841, grad_norm=4.897, loss_scale=1.000, learning_rate=7.230e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 04:22:46,606 (deepspeed_trainer:228) INFO: 31epoch:train:9201-9300batch: iter_time=1.122e-04, loss_ctc=68.190, loss_att=52.921, acc=0.719, loss=57.527, grad_norm=5.683, loss_scale=1.000, learning_rate=7.229e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 04:23:16,329 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-08 04:23:42,872 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 04:23:58,996 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 04:23:58,996 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 04:23:58,998 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 04:24:22,381] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:24:24,440] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:24:25,049] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:24:24,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:24:25,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:24:25,286] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:24:27,118] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:24:28,887] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:24:28,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:24:29,177] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:24:28,387] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:24:28,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:24:28,853] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:24:29,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:24:30,331] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:24:31,540] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:10,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:12,599] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:13,885] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:13,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:16,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:17,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:17,953] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:18,883] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:19,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:19,444] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:18,823] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:19,338] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:20,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:19,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:19,924] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:20,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:25:57,882] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:01,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:01,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:02,326] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:07,262] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:07,475] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:07,812] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:08,082] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:08,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:08,311] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:08,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:08,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:09,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:09,339] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:11,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:13,977] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:44,213] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:48,469] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:48,680] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:50,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:56,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:56,806] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:57,190] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:57,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:58,032] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:58,253] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:58,867] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:59,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:59,500] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:26:59,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:27:00,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:27:06,915] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 04:27:41,508 (deepspeed_trainer:228) INFO: 31epoch:train:9301-9400batch: iter_time=2.555, loss_ctc=76.666, loss_att=56.838, acc=0.716, loss=62.737, grad_norm=5.097, loss_scale=1.000, learning_rate=7.228e-05, step_time=0.393 [cnode7-012:0/16] 2024-12-08 04:28:14,478 (deepspeed_trainer:228) INFO: 31epoch:train:9401-9500batch: iter_time=1.132e-04, loss_ctc=69.630, loss_att=54.918, acc=0.705, loss=59.332, grad_norm=5.483, loss_scale=1.000, learning_rate=7.227e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 04:28:47,108 (deepspeed_trainer:228) INFO: 31epoch:train:9501-9600batch: iter_time=1.104e-04, loss_ctc=63.926, loss_att=44.403, acc=0.721, loss=50.254, grad_norm=5.511, loss_scale=1.000, learning_rate=7.227e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 04:29:20,379 (deepspeed_trainer:228) INFO: 31epoch:train:9601-9700batch: iter_time=1.135e-04, loss_ctc=70.419, loss_att=54.742, acc=0.709, loss=59.436, grad_norm=5.480, loss_scale=1.000, learning_rate=7.226e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 04:29:53,426 (deepspeed_trainer:228) INFO: 31epoch:train:9701-9800batch: iter_time=1.111e-04, loss_ctc=70.141, loss_att=58.614, acc=0.701, loss=62.084, grad_norm=5.538, loss_scale=1.000, learning_rate=7.225e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 04:30:26,560 (deepspeed_trainer:228) INFO: 31epoch:train:9801-9900batch: iter_time=1.129e-04, loss_ctc=74.739, loss_att=61.204, acc=0.708, loss=65.257, grad_norm=5.963, loss_scale=1.000, learning_rate=7.224e-05, step_time=0.331 [2024-12-08 04:30:59,445] [INFO] [logging.py:129:log_dist] [Rank 0] step=460000, skipped=0, lr=[np.float64(7.223143333907665e-05)], mom=[[0.9, 0.98]] [2024-12-08 04:30:59,446] [INFO] [timer.py:264:stop] epoch=0/micro_step=70000/global_step=70000, RunningAvgSamplesPerSec=49.28754700184331, CurrSamplesPerSec=51.35782113992427, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 04:30:59,448 (deepspeed_trainer:228) INFO: 31epoch:train:9901-10000batch: iter_time=1.124e-04, loss_ctc=70.835, loss_att=55.140, acc=0.712, loss=59.861, grad_norm=5.748, loss_scale=1.000, learning_rate=7.224e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 04:31:33,312 (deepspeed_trainer:228) INFO: 31epoch:train:10001-10100batch: iter_time=1.143e-04, loss_ctc=66.229, loss_att=51.386, acc=0.717, loss=55.868, grad_norm=4.856, loss_scale=1.000, learning_rate=7.223e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 04:32:06,354 (deepspeed_trainer:228) INFO: 31epoch:train:10101-10200batch: iter_time=1.137e-04, loss_ctc=73.577, loss_att=52.514, acc=0.712, loss=58.830, grad_norm=5.263, loss_scale=1.000, learning_rate=7.222e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 04:32:39,277 (deepspeed_trainer:228) INFO: 31epoch:train:10201-10300batch: iter_time=1.099e-04, loss_ctc=68.370, loss_att=49.602, acc=0.722, loss=55.209, grad_norm=5.559, loss_scale=1.000, learning_rate=7.221e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 04:33:12,325 (deepspeed_trainer:228) INFO: 31epoch:train:10301-10400batch: iter_time=1.115e-04, loss_ctc=68.733, loss_att=54.812, acc=0.710, loss=58.980, grad_norm=5.497, loss_scale=1.000, learning_rate=7.220e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 04:33:45,341 (deepspeed_trainer:228) INFO: 31epoch:train:10401-10500batch: iter_time=1.161e-04, loss_ctc=69.355, loss_att=50.683, acc=0.724, loss=56.317, grad_norm=5.099, loss_scale=1.000, learning_rate=7.220e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 04:34:17,407 (deepspeed_trainer:228) INFO: 31epoch:train:10501-10600batch: iter_time=1.150e-04, loss_ctc=68.401, loss_att=53.358, acc=0.721, loss=57.888, grad_norm=5.306, loss_scale=1.000, learning_rate=7.219e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 04:34:50,402 (deepspeed_trainer:228) INFO: 31epoch:train:10601-10700batch: iter_time=1.150e-04, loss_ctc=79.087, loss_att=60.006, acc=0.720, loss=65.750, grad_norm=5.142, loss_scale=1.000, learning_rate=7.218e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 04:35:23,015 (deepspeed_trainer:228) INFO: 31epoch:train:10701-10800batch: iter_time=1.159e-04, loss_ctc=64.987, loss_att=50.676, acc=0.717, loss=54.955, grad_norm=5.308, loss_scale=1.000, learning_rate=7.217e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 04:35:55,709 (deepspeed_trainer:228) INFO: 31epoch:train:10801-10900batch: iter_time=1.184e-04, loss_ctc=73.151, loss_att=57.135, acc=0.715, loss=61.927, grad_norm=7.338, loss_scale=1.000, learning_rate=7.216e-05, step_time=0.327 [2024-12-08 04:36:28,427] [INFO] [logging.py:129:log_dist] [Rank 0] step=461000, skipped=0, lr=[np.float64(7.215304885958986e-05)], mom=[[0.9, 0.98]] [2024-12-08 04:36:28,427] [INFO] [timer.py:264:stop] epoch=0/micro_step=71000/global_step=71000, RunningAvgSamplesPerSec=49.30077068893674, CurrSamplesPerSec=51.74315491838076, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 04:36:28,430 (deepspeed_trainer:228) INFO: 31epoch:train:10901-11000batch: iter_time=1.145e-04, loss_ctc=70.918, loss_att=52.252, acc=0.722, loss=57.867, grad_norm=5.754, loss_scale=1.000, learning_rate=7.216e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 04:37:01,738 (deepspeed_trainer:228) INFO: 31epoch:train:11001-11100batch: iter_time=1.341e-04, loss_ctc=60.923, loss_att=45.114, acc=0.738, loss=49.855, grad_norm=5.294, loss_scale=1.000, learning_rate=7.215e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 04:37:34,693 (deepspeed_trainer:228) INFO: 31epoch:train:11101-11200batch: iter_time=1.107e-04, loss_ctc=73.891, loss_att=56.357, acc=0.712, loss=61.583, grad_norm=5.325, loss_scale=1.000, learning_rate=7.214e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 04:37:56,165 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-08 04:38:23,560 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 04:38:40,590 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 04:38:40,590 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 04:38:40,593 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 04:39:04,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:05,091] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:04,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:05,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:05,840] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:07,654] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:08,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:08,822] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:08,385] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:09,115] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:08,536] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:08,874] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:09,876] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:10,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:11,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:11,263] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:52,119] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:52,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:52,999] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:53,374] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:53,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:56,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:56,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:57,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:57,434] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:58,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:57,908] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:58,229] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:59,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:00,147] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:39:59,745] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:00,473] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:41,036] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:41,951] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:41,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:42,136] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:45,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:46,749] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:47,252] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:47,069] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:47,887] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:47,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:48,080] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:48,756] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:48,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:48,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:51,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:40:57,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:27,961] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:28,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:32,804] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:32,909] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:36,262] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:35,770] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:36,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:36,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:37,047] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:36,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:37,464] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:37,508] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:37,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:37,414] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:39,515] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:41:50,950] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 04:42:32,907 (deepspeed_trainer:228) INFO: 31epoch:train:11201-11300batch: iter_time=2.523, loss_ctc=75.634, loss_att=57.227, acc=0.715, loss=62.742, grad_norm=5.530, loss_scale=1.000, learning_rate=7.213e-05, step_time=0.447 [cnode7-012:0/16] 2024-12-08 04:43:06,262 (deepspeed_trainer:228) INFO: 31epoch:train:11301-11400batch: iter_time=1.104e-04, loss_ctc=72.201, loss_att=55.873, acc=0.704, loss=60.797, grad_norm=5.400, loss_scale=1.000, learning_rate=7.213e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 04:43:39,429 (deepspeed_trainer:228) INFO: 31epoch:train:11401-11500batch: iter_time=1.050e-04, loss_ctc=59.596, loss_att=41.726, acc=0.729, loss=47.083, grad_norm=5.298, loss_scale=1.000, learning_rate=7.212e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 04:44:12,507 (deepspeed_trainer:228) INFO: 31epoch:train:11501-11600batch: iter_time=1.101e-04, loss_ctc=71.801, loss_att=58.470, acc=0.695, loss=62.477, grad_norm=5.470, loss_scale=1.000, learning_rate=7.211e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 04:44:45,609 (deepspeed_trainer:228) INFO: 31epoch:train:11601-11700batch: iter_time=1.093e-04, loss_ctc=77.599, loss_att=62.590, acc=0.706, loss=67.083, grad_norm=5.551, loss_scale=1.000, learning_rate=7.210e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 04:45:18,592 (deepspeed_trainer:228) INFO: 31epoch:train:11701-11800batch: iter_time=1.106e-04, loss_ctc=68.357, loss_att=57.825, acc=0.707, loss=60.992, grad_norm=5.282, loss_scale=1.000, learning_rate=7.209e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 04:45:51,437 (deepspeed_trainer:228) INFO: 31epoch:train:11801-11900batch: iter_time=1.103e-04, loss_ctc=66.200, loss_att=48.512, acc=0.720, loss=53.813, grad_norm=5.272, loss_scale=1.000, learning_rate=7.209e-05, step_time=0.328 [2024-12-08 04:46:24,598] [INFO] [logging.py:129:log_dist] [Rank 0] step=462000, skipped=0, lr=[np.float64(7.207491901252782e-05)], mom=[[0.9, 0.98]] [2024-12-08 04:46:24,599] [INFO] [timer.py:264:stop] epoch=0/micro_step=72000/global_step=72000, RunningAvgSamplesPerSec=49.284141123993365, CurrSamplesPerSec=46.943713365405245, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 04:46:24,602 (deepspeed_trainer:228) INFO: 31epoch:train:11901-12000batch: iter_time=1.111e-04, loss_ctc=67.986, loss_att=53.478, acc=0.722, loss=57.833, grad_norm=5.772, loss_scale=1.000, learning_rate=7.208e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 04:46:57,704 (deepspeed_trainer:228) INFO: 31epoch:train:12001-12100batch: iter_time=1.107e-04, loss_ctc=73.147, loss_att=50.302, acc=0.718, loss=57.169, grad_norm=5.728, loss_scale=1.000, learning_rate=7.207e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 04:47:30,606 (deepspeed_trainer:228) INFO: 31epoch:train:12101-12200batch: iter_time=1.088e-04, loss_ctc=67.198, loss_att=51.540, acc=0.716, loss=56.252, grad_norm=5.154, loss_scale=1.000, learning_rate=7.206e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 04:48:03,595 (deepspeed_trainer:228) INFO: 31epoch:train:12201-12300batch: iter_time=1.102e-04, loss_ctc=71.239, loss_att=54.314, acc=0.711, loss=59.396, grad_norm=5.132, loss_scale=1.000, learning_rate=7.206e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 04:48:36,461 (deepspeed_trainer:228) INFO: 31epoch:train:12301-12400batch: iter_time=1.106e-04, loss_ctc=66.987, loss_att=50.046, acc=0.724, loss=55.130, grad_norm=5.413, loss_scale=1.000, learning_rate=7.205e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 04:49:09,438 (deepspeed_trainer:228) INFO: 31epoch:train:12401-12500batch: iter_time=1.137e-04, loss_ctc=73.451, loss_att=60.286, acc=0.719, loss=64.207, grad_norm=5.601, loss_scale=1.000, learning_rate=7.204e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 04:49:42,257 (deepspeed_trainer:228) INFO: 31epoch:train:12501-12600batch: iter_time=1.104e-04, loss_ctc=70.809, loss_att=49.813, acc=0.731, loss=56.109, grad_norm=5.482, loss_scale=1.000, learning_rate=7.203e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 04:50:15,226 (deepspeed_trainer:228) INFO: 31epoch:train:12601-12700batch: iter_time=1.154e-04, loss_ctc=70.831, loss_att=55.220, acc=0.715, loss=59.915, grad_norm=6.123, loss_scale=1.000, learning_rate=7.202e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 04:50:48,156 (deepspeed_trainer:228) INFO: 31epoch:train:12701-12800batch: iter_time=1.116e-04, loss_ctc=68.313, loss_att=52.751, acc=0.718, loss=57.415, grad_norm=7.126, loss_scale=1.000, learning_rate=7.202e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 04:51:21,121 (deepspeed_trainer:228) INFO: 31epoch:train:12801-12900batch: iter_time=1.099e-04, loss_ctc=72.409, loss_att=53.283, acc=0.725, loss=59.016, grad_norm=6.183, loss_scale=1.000, learning_rate=7.201e-05, step_time=0.329 [2024-12-08 04:51:53,719] [INFO] [logging.py:129:log_dist] [Rank 0] step=463000, skipped=0, lr=[np.float64(7.19970424222447e-05)], mom=[[0.9, 0.98]] [2024-12-08 04:51:53,720] [INFO] [timer.py:264:stop] epoch=0/micro_step=73000/global_step=73000, RunningAvgSamplesPerSec=49.2954546517598, CurrSamplesPerSec=50.724412414461106, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 04:51:53,723 (deepspeed_trainer:228) INFO: 31epoch:train:12901-13000batch: iter_time=1.096e-04, loss_ctc=64.679, loss_att=46.595, acc=0.731, loss=52.028, grad_norm=6.260, loss_scale=1.000, learning_rate=7.200e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 04:52:26,938 (deepspeed_trainer:228) INFO: 31epoch:train:13001-13100batch: iter_time=1.142e-04, loss_ctc=76.238, loss_att=55.801, acc=0.718, loss=61.936, grad_norm=5.239, loss_scale=1.000, learning_rate=7.199e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 04:52:39,790 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-08 04:53:07,296 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 04:53:23,268 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 04:53:23,268 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 04:53:23,270 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 04:53:47,521] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:53:48,522] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:53:48,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:53:48,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:53:48,528] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:53:48,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:53:50,029] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:53:50,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:53:50,727] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:53:51,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:53:51,955] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:53:52,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:53:53,426] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:53:53,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:53:53,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:53:55,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:36,851] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:37,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:38,131] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:38,459] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:39,517] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:40,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:41,327] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:41,899] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:42,116] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:42,220] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:42,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:42,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:42,911] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:43,720] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:44,629] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:54:44,414] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:24,400] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:24,652] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:26,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:28,086] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:28,716] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:29,748] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:30,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:31,388] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:33,063] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:33,286] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:33,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:33,771] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:33,985] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:35,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:35,769] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:55:36,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:11,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:11,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:16,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:17,358] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:17,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:18,948] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:19,429] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:20,258] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:21,803] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:22,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:22,469] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:22,566] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:23,877] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:24,467] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:26,701] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 04:56:26,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 04:57:18,772 (deepspeed_trainer:228) INFO: 31epoch:train:13101-13200batch: iter_time=2.547, loss_ctc=72.394, loss_att=59.612, acc=0.711, loss=63.447, grad_norm=5.615, loss_scale=1.000, learning_rate=7.199e-05, step_time=0.371 [cnode7-012:0/16] 2024-12-08 04:57:52,403 (deepspeed_trainer:228) INFO: 31epoch:train:13201-13300batch: iter_time=1.177e-04, loss_ctc=69.782, loss_att=53.515, acc=0.724, loss=58.381, grad_norm=5.633, loss_scale=1.000, learning_rate=7.198e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 04:58:25,145 (deepspeed_trainer:228) INFO: 31epoch:train:13301-13400batch: iter_time=1.091e-04, loss_ctc=63.600, loss_att=47.218, acc=0.724, loss=52.142, grad_norm=6.189, loss_scale=1.000, learning_rate=7.197e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 04:58:58,566 (deepspeed_trainer:228) INFO: 31epoch:train:13401-13500batch: iter_time=1.119e-04, loss_ctc=67.590, loss_att=55.873, acc=0.703, loss=59.394, grad_norm=5.773, loss_scale=1.000, learning_rate=7.196e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 04:59:31,672 (deepspeed_trainer:228) INFO: 31epoch:train:13501-13600batch: iter_time=1.113e-04, loss_ctc=75.023, loss_att=63.947, acc=0.708, loss=67.257, grad_norm=6.228, loss_scale=1.000, learning_rate=7.195e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 05:00:03,842 (deepspeed_trainer:228) INFO: 31epoch:train:13601-13700batch: iter_time=1.129e-04, loss_ctc=76.606, loss_att=63.123, acc=0.715, loss=67.175, grad_norm=5.143, loss_scale=1.000, learning_rate=7.195e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 05:00:36,488 (deepspeed_trainer:228) INFO: 31epoch:train:13701-13800batch: iter_time=1.100e-04, loss_ctc=60.244, loss_att=47.027, acc=0.725, loss=50.961, grad_norm=4.924, loss_scale=1.000, learning_rate=7.194e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 05:01:09,827 (deepspeed_trainer:228) INFO: 31epoch:train:13801-13900batch: iter_time=1.159e-04, loss_ctc=70.267, loss_att=54.662, acc=0.723, loss=59.358, grad_norm=5.809, loss_scale=1.000, learning_rate=7.193e-05, step_time=0.329 [2024-12-08 05:01:43,088] [INFO] [logging.py:129:log_dist] [Rank 0] step=464000, skipped=0, lr=[np.float64(7.191941772347684e-05)], mom=[[0.9, 0.98]] [2024-12-08 05:01:43,089] [INFO] [timer.py:264:stop] epoch=0/micro_step=74000/global_step=74000, RunningAvgSamplesPerSec=49.29574360701699, CurrSamplesPerSec=52.33503874004821, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 05:01:43,091 (deepspeed_trainer:228) INFO: 31epoch:train:13901-14000batch: iter_time=1.111e-04, loss_ctc=69.959, loss_att=50.076, acc=0.719, loss=56.045, grad_norm=7.418, loss_scale=1.000, learning_rate=7.192e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 05:02:16,165 (deepspeed_trainer:228) INFO: 31epoch:train:14001-14100batch: iter_time=1.150e-04, loss_ctc=71.036, loss_att=54.276, acc=0.721, loss=59.324, grad_norm=6.655, loss_scale=1.000, learning_rate=7.192e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 05:02:49,706 (deepspeed_trainer:228) INFO: 31epoch:train:14101-14200batch: iter_time=1.116e-04, loss_ctc=70.434, loss_att=53.321, acc=0.717, loss=58.433, grad_norm=6.603, loss_scale=1.000, learning_rate=7.191e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 05:03:23,050 (deepspeed_trainer:228) INFO: 31epoch:train:14201-14300batch: iter_time=1.134e-04, loss_ctc=68.581, loss_att=51.947, acc=0.727, loss=56.949, grad_norm=6.144, loss_scale=1.000, learning_rate=7.190e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 05:03:56,563 (deepspeed_trainer:228) INFO: 31epoch:train:14301-14400batch: iter_time=1.119e-04, loss_ctc=73.933, loss_att=59.946, acc=0.729, loss=64.134, grad_norm=6.666, loss_scale=1.000, learning_rate=7.189e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 05:04:29,998 (deepspeed_trainer:228) INFO: 31epoch:train:14401-14500batch: iter_time=1.117e-04, loss_ctc=73.205, loss_att=59.557, acc=0.721, loss=63.681, grad_norm=5.857, loss_scale=1.000, learning_rate=7.188e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 05:05:03,086 (deepspeed_trainer:228) INFO: 31epoch:train:14501-14600batch: iter_time=1.107e-04, loss_ctc=73.372, loss_att=55.288, acc=0.720, loss=60.706, grad_norm=7.144, loss_scale=1.000, learning_rate=7.188e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 05:05:36,034 (deepspeed_trainer:228) INFO: 31epoch:train:14601-14700batch: iter_time=1.130e-04, loss_ctc=60.919, loss_att=44.826, acc=0.731, loss=49.660, grad_norm=5.779, loss_scale=1.000, learning_rate=7.187e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 05:06:09,281 (deepspeed_trainer:228) INFO: 31epoch:train:14701-14800batch: iter_time=1.125e-04, loss_ctc=70.985, loss_att=54.578, acc=0.731, loss=59.483, grad_norm=5.702, loss_scale=1.000, learning_rate=7.186e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 05:06:42,716 (deepspeed_trainer:228) INFO: 31epoch:train:14801-14900batch: iter_time=1.120e-04, loss_ctc=67.543, loss_att=51.930, acc=0.729, loss=56.609, grad_norm=5.536, loss_scale=1.000, learning_rate=7.185e-05, step_time=0.334 [2024-12-08 05:07:15,933] [INFO] [logging.py:129:log_dist] [Rank 0] step=465000, skipped=0, lr=[np.float64(7.18420435612423e-05)], mom=[[0.9, 0.98]] [2024-12-08 05:07:15,934] [INFO] [timer.py:264:stop] epoch=0/micro_step=75000/global_step=75000, RunningAvgSamplesPerSec=49.29981944429331, CurrSamplesPerSec=53.38942617218161, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 05:07:15,936 (deepspeed_trainer:228) INFO: 31epoch:train:14901-15000batch: iter_time=1.139e-04, loss_ctc=76.323, loss_att=55.900, acc=0.725, loss=62.038, grad_norm=4.978, loss_scale=1.000, learning_rate=7.185e-05, step_time=0.332 [2024-12-08 05:07:32,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:33,255] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:33,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:33,218] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:33,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:33,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:33,706] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:33,464] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:33,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:33,494] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:33,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:33,882] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:33,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:33,948] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:33,981] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:34,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:48,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:48,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:48,333] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:48,796] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:49,147] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:49,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:49,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:49,857] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:50,112] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:50,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:50,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:50,566] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:50,662] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:51,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:51,288] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:07:51,324] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:03,876] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:04,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:04,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:04,264] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:04,707] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:05,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:05,193] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:05,480] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:06,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:05,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:06,472] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:06,299] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:06,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:06,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:06,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:07,294] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:19,038] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:19,434] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:19,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:20,320] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:20,424] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:21,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:20,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:21,507] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:21,320] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:22,005] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:21,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:22,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:22,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:22,796] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:22,910] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:23,524] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:08:35,766] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 31 is about to be saved! [2024-12-08 05:08:35,798] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/mp_rank_00_model_states.pt [2024-12-08 05:08:35,798] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/mp_rank_00_model_states.pt... [2024-12-08 05:08:37,625] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/mp_rank_00_model_states.pt. [2024-12-08 05:08:37,779] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-08 05:08:38,119] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-08 05:08:38,121] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-08 05:08:37,783] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-08 05:08:37,783] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-08 05:08:37,783] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-08 05:08:38,122] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-08 05:08:37,784] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-08 05:08:37,784] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-08 05:08:38,123] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-08 05:08:37,785] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-08 05:08:38,124] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-08 05:08:38,125] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-08 05:08:38,126] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-08 05:08:38,126] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-08 05:08:37,788] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-08 05:08:38,512] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,512] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,512] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [2024-12-08 05:08:38,529] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,532] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,532] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,532] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [2024-12-08 05:08:38,536] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,536] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [2024-12-08 05:08:38,882] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,882] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,882] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,882] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,882] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [2024-12-08 05:08:38,882] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [2024-12-08 05:08:38,886] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,886] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,886] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [2024-12-08 05:08:38,889] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,889] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,889] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [2024-12-08 05:08:38,897] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,897] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,897] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [2024-12-08 05:08:38,571] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,571] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,572] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [2024-12-08 05:08:38,577] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,577] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,577] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [2024-12-08 05:08:38,916] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,916] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,916] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [2024-12-08 05:08:38,587] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,587] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,587] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [2024-12-08 05:08:38,930] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,930] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,930] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [2024-12-08 05:08:38,934] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,934] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,934] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [2024-12-08 05:08:38,598] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,598] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,598] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [2024-12-08 05:08:38,608] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-08 05:08:38,608] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_31/31/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-08 05:08:38,608] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 31 is ready now! [cnode7-012:0/16] 2024-12-08 05:08:38,619 (deepspeed_trainer:158) INFO: 31epoch results: [train] iter_time=0.135, loss_ctc=70.687, loss_att=54.307, acc=0.717, loss=59.219, grad_norm=5.686, loss_scale=1.000, learning_rate=7.243e-05, step_time=0.334, time=1 hour, 57 minutes and 26.82 seconds, total_count=465031, gpu_max_cached_mem_GB=41.424, [valid] loss_ctc=3.422, cer_ctc=0.112, loss_att=5.250, acc=0.915, cer=0.104, wer=0.812, loss=4.719, time=1 minute and 9.73 seconds, total_count=31, gpu_max_cached_mem_GB=41.424 [cnode7-012:0/16] 2024-12-08 05:08:40,075 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-08 05:09:08,898 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 05:09:25,005 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 05:09:25,005 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 05:09:25,007 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 05:09:44,804] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:09:45,395] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:09:46,819] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:09:45,970] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:09:47,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:09:47,730] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:09:48,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:09:46,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:09:48,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:09:48,757] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:09:48,893] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:09:49,230] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:09:49,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:09:50,587] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:09:50,626] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:09:50,911] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:33,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:34,572] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:36,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:35,587] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:37,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:37,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:37,695] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:38,648] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:37,544] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:39,032] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:39,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:38,354] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:38,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:40,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:41,067] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:10:41,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:21,396] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:22,349] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:25,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:25,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:26,301] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:26,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:25,693] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:26,416] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:27,950] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:26,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:27,145] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:28,663] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:29,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:28,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:30,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:11:31,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:08,885] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:10,596] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:13,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:13,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:14,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:15,158] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:14,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:15,529] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:16,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:15,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:18,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:16,956] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:16,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:17,853] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:20,454] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:12:22,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 05:13:22,535 (deepspeed_trainer:228) INFO: 32epoch:train:1-100batch: iter_time=2.447, loss_ctc=71.772, loss_att=56.992, acc=0.727, loss=61.420, grad_norm=5.156, loss_scale=1.000, learning_rate=7.184e-05, step_time=0.387 [cnode7-012:0/16] 2024-12-08 05:13:56,154 (deepspeed_trainer:228) INFO: 32epoch:train:101-200batch: iter_time=1.141e-04, loss_ctc=74.645, loss_att=59.273, acc=0.714, loss=63.869, grad_norm=6.898, loss_scale=1.000, learning_rate=7.183e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 05:14:29,612 (deepspeed_trainer:228) INFO: 32epoch:train:201-300batch: iter_time=1.175e-04, loss_ctc=61.544, loss_att=46.576, acc=0.724, loss=51.074, grad_norm=6.104, loss_scale=1.000, learning_rate=7.182e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 05:15:02,955 (deepspeed_trainer:228) INFO: 32epoch:train:301-400batch: iter_time=1.076e-04, loss_ctc=69.637, loss_att=52.434, acc=0.728, loss=57.601, grad_norm=6.037, loss_scale=1.000, learning_rate=7.181e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 05:15:36,385 (deepspeed_trainer:228) INFO: 32epoch:train:401-500batch: iter_time=1.113e-04, loss_ctc=68.246, loss_att=55.867, acc=0.721, loss=59.603, grad_norm=6.124, loss_scale=1.000, learning_rate=7.181e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 05:16:09,715 (deepspeed_trainer:228) INFO: 32epoch:train:501-600batch: iter_time=1.128e-04, loss_ctc=75.648, loss_att=62.416, acc=0.697, loss=66.383, grad_norm=6.918, loss_scale=1.000, learning_rate=7.180e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 05:16:42,563 (deepspeed_trainer:228) INFO: 32epoch:train:601-700batch: iter_time=1.155e-04, loss_ctc=64.868, loss_att=49.660, acc=0.714, loss=54.198, grad_norm=5.754, loss_scale=1.000, learning_rate=7.179e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 05:17:15,971 (deepspeed_trainer:228) INFO: 32epoch:train:701-800batch: iter_time=1.127e-04, loss_ctc=80.917, loss_att=62.752, acc=0.706, loss=68.196, grad_norm=6.805, loss_scale=1.000, learning_rate=7.178e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 05:17:49,229 (deepspeed_trainer:228) INFO: 32epoch:train:801-900batch: iter_time=1.121e-04, loss_ctc=70.059, loss_att=53.637, acc=0.715, loss=58.563, grad_norm=6.813, loss_scale=1.000, learning_rate=7.178e-05, step_time=0.332 [2024-12-08 05:18:22,804] [INFO] [logging.py:129:log_dist] [Rank 0] step=466000, skipped=0, lr=[np.float64(7.176491859074148e-05)], mom=[[0.9, 0.98]] [2024-12-08 05:18:22,805] [INFO] [timer.py:264:stop] epoch=0/micro_step=76000/global_step=76000, RunningAvgSamplesPerSec=49.291414815036184, CurrSamplesPerSec=46.47092935862829, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 05:18:22,808 (deepspeed_trainer:228) INFO: 32epoch:train:901-1000batch: iter_time=1.173e-04, loss_ctc=88.129, loss_att=59.209, acc=0.713, loss=67.892, grad_norm=6.380, loss_scale=1.000, learning_rate=7.177e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 05:18:56,062 (deepspeed_trainer:228) INFO: 32epoch:train:1001-1100batch: iter_time=1.129e-04, loss_ctc=75.823, loss_att=54.569, acc=0.704, loss=60.955, grad_norm=6.117, loss_scale=1.000, learning_rate=7.176e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 05:19:29,181 (deepspeed_trainer:228) INFO: 32epoch:train:1101-1200batch: iter_time=1.117e-04, loss_ctc=71.519, loss_att=55.395, acc=0.713, loss=60.257, grad_norm=7.282, loss_scale=1.000, learning_rate=7.175e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 05:20:02,377 (deepspeed_trainer:228) INFO: 32epoch:train:1201-1300batch: iter_time=1.199e-04, loss_ctc=76.768, loss_att=58.602, acc=0.710, loss=64.069, grad_norm=6.931, loss_scale=1.000, learning_rate=7.175e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 05:20:35,874 (deepspeed_trainer:228) INFO: 32epoch:train:1301-1400batch: iter_time=1.188e-04, loss_ctc=79.610, loss_att=59.590, acc=0.727, loss=65.582, grad_norm=6.068, loss_scale=1.000, learning_rate=7.174e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 05:21:08,869 (deepspeed_trainer:228) INFO: 32epoch:train:1401-1500batch: iter_time=1.227e-04, loss_ctc=67.056, loss_att=50.360, acc=0.719, loss=55.373, grad_norm=6.169, loss_scale=1.000, learning_rate=7.173e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 05:21:42,327 (deepspeed_trainer:228) INFO: 32epoch:train:1501-1600batch: iter_time=1.139e-04, loss_ctc=75.200, loss_att=56.988, acc=0.711, loss=62.458, grad_norm=6.764, loss_scale=1.000, learning_rate=7.172e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 05:22:16,701 (deepspeed_trainer:228) INFO: 32epoch:train:1601-1700batch: iter_time=1.263e-04, loss_ctc=68.919, loss_att=56.007, acc=0.723, loss=59.874, grad_norm=5.495, loss_scale=1.000, learning_rate=7.171e-05, step_time=0.343 [cnode7-012:0/16] 2024-12-08 05:22:50,336 (deepspeed_trainer:228) INFO: 32epoch:train:1701-1800batch: iter_time=1.098e-04, loss_ctc=71.809, loss_att=53.486, acc=0.726, loss=58.994, grad_norm=6.465, loss_scale=1.000, learning_rate=7.171e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 05:23:20,475 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-08 05:23:46,549 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 05:24:03,855 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 05:24:03,855 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 05:24:03,857 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 05:24:28,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:24:29,260] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:24:29,440] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:24:30,312] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:24:30,349] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:24:32,510] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:24:31,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:24:33,123] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:24:33,668] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:24:33,983] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:24:33,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:24:34,757] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:24:33,977] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:24:34,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:24:34,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:24:40,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:17,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:18,098] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:18,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:20,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:20,566] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:20,869] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:22,399] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:22,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:21,761] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:22,360] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:23,738] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:24,024] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:23,042] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:23,689] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:26,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:25:28,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:05,662] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:06,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:09,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:10,881] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:10,038] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:10,150] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:10,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:11,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:12,731] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:13,167] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:13,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:12,837] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:13,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:15,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:17,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:16,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:53,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:54,005] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:58,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:58,943] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:59,487] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:27:00,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:26:59,971] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:27:01,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:27:01,471] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:27:01,834] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:27:00,769] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:27:03,703] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:27:06,496] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:27:06,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:27:05,868] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:27:07,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 05:27:41,746 (deepspeed_trainer:228) INFO: 32epoch:train:1801-1900batch: iter_time=2.546, loss_ctc=77.978, loss_att=59.097, acc=0.722, loss=64.785, grad_norm=7.123, loss_scale=1.000, learning_rate=7.170e-05, step_time=0.367 [2024-12-08 05:28:14,803] [INFO] [logging.py:129:log_dist] [Rank 0] step=467000, skipped=0, lr=[np.float64(7.168804147725896e-05)], mom=[[0.9, 0.98]] [2024-12-08 05:28:14,804] [INFO] [timer.py:264:stop] epoch=0/micro_step=77000/global_step=77000, RunningAvgSamplesPerSec=49.286509703396966, CurrSamplesPerSec=52.76832316916011, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 05:28:14,806 (deepspeed_trainer:228) INFO: 32epoch:train:1901-2000batch: iter_time=1.126e-04, loss_ctc=72.832, loss_att=57.360, acc=0.724, loss=61.977, grad_norm=5.792, loss_scale=1.000, learning_rate=7.169e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 05:28:47,894 (deepspeed_trainer:228) INFO: 32epoch:train:2001-2100batch: iter_time=1.083e-04, loss_ctc=74.192, loss_att=59.230, acc=0.715, loss=63.732, grad_norm=7.117, loss_scale=1.000, learning_rate=7.168e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 05:29:20,719 (deepspeed_trainer:228) INFO: 32epoch:train:2101-2200batch: iter_time=1.096e-04, loss_ctc=62.070, loss_att=45.158, acc=0.732, loss=50.233, grad_norm=5.698, loss_scale=1.000, learning_rate=7.168e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 05:29:53,458 (deepspeed_trainer:228) INFO: 32epoch:train:2201-2300batch: iter_time=1.098e-04, loss_ctc=65.056, loss_att=50.993, acc=0.732, loss=55.209, grad_norm=6.064, loss_scale=1.000, learning_rate=7.167e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 05:30:26,485 (deepspeed_trainer:228) INFO: 32epoch:train:2301-2400batch: iter_time=1.115e-04, loss_ctc=73.188, loss_att=60.734, acc=0.711, loss=64.472, grad_norm=5.515, loss_scale=1.000, learning_rate=7.166e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 05:30:59,392 (deepspeed_trainer:228) INFO: 32epoch:train:2401-2500batch: iter_time=1.071e-04, loss_ctc=70.402, loss_att=57.740, acc=0.707, loss=61.525, grad_norm=5.802, loss_scale=1.000, learning_rate=7.165e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 05:31:32,030 (deepspeed_trainer:228) INFO: 32epoch:train:2501-2600batch: iter_time=1.071e-04, loss_ctc=67.015, loss_att=54.224, acc=0.702, loss=58.061, grad_norm=5.773, loss_scale=1.000, learning_rate=7.165e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 05:32:05,001 (deepspeed_trainer:228) INFO: 32epoch:train:2601-2700batch: iter_time=1.096e-04, loss_ctc=78.832, loss_att=60.081, acc=0.719, loss=65.727, grad_norm=5.825, loss_scale=1.000, learning_rate=7.164e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 05:32:38,210 (deepspeed_trainer:228) INFO: 32epoch:train:2701-2800batch: iter_time=1.089e-04, loss_ctc=69.629, loss_att=52.633, acc=0.710, loss=57.748, grad_norm=5.715, loss_scale=1.000, learning_rate=7.163e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 05:33:11,575 (deepspeed_trainer:228) INFO: 32epoch:train:2801-2900batch: iter_time=1.085e-04, loss_ctc=86.498, loss_att=58.501, acc=0.721, loss=66.870, grad_norm=6.038, loss_scale=1.000, learning_rate=7.162e-05, step_time=0.333 [2024-12-08 05:33:44,318] [INFO] [logging.py:129:log_dist] [Rank 0] step=468000, skipped=0, lr=[np.float64(7.161141089606654e-05)], mom=[[0.9, 0.98]] [2024-12-08 05:33:44,319] [INFO] [timer.py:264:stop] epoch=0/micro_step=78000/global_step=78000, RunningAvgSamplesPerSec=49.29616217275243, CurrSamplesPerSec=47.543443917203625, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 05:33:44,321 (deepspeed_trainer:228) INFO: 32epoch:train:2901-3000batch: iter_time=1.064e-04, loss_ctc=74.919, loss_att=55.439, acc=0.708, loss=61.314, grad_norm=6.141, loss_scale=1.000, learning_rate=7.162e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 05:34:17,005 (deepspeed_trainer:228) INFO: 32epoch:train:3001-3100batch: iter_time=1.104e-04, loss_ctc=68.874, loss_att=50.985, acc=0.717, loss=56.364, grad_norm=6.022, loss_scale=1.000, learning_rate=7.161e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 05:34:50,227 (deepspeed_trainer:228) INFO: 32epoch:train:3101-3200batch: iter_time=1.066e-04, loss_ctc=86.389, loss_att=67.219, acc=0.707, loss=72.973, grad_norm=6.686, loss_scale=1.000, learning_rate=7.160e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 05:35:24,300 (deepspeed_trainer:228) INFO: 32epoch:train:3201-3300batch: iter_time=1.095e-04, loss_ctc=70.743, loss_att=53.590, acc=0.734, loss=58.728, grad_norm=5.670, loss_scale=1.000, learning_rate=7.159e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 05:35:56,868 (deepspeed_trainer:228) INFO: 32epoch:train:3301-3400batch: iter_time=1.086e-04, loss_ctc=67.257, loss_att=49.942, acc=0.718, loss=55.135, grad_norm=5.598, loss_scale=1.000, learning_rate=7.158e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-08 05:36:29,801 (deepspeed_trainer:228) INFO: 32epoch:train:3401-3500batch: iter_time=1.098e-04, loss_ctc=74.811, loss_att=60.785, acc=0.710, loss=64.970, grad_norm=5.354, loss_scale=1.000, learning_rate=7.158e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 05:37:02,603 (deepspeed_trainer:228) INFO: 32epoch:train:3501-3600batch: iter_time=1.078e-04, loss_ctc=65.691, loss_att=48.715, acc=0.731, loss=53.807, grad_norm=5.211, loss_scale=1.000, learning_rate=7.157e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 05:37:35,588 (deepspeed_trainer:228) INFO: 32epoch:train:3601-3700batch: iter_time=1.090e-04, loss_ctc=74.906, loss_att=56.463, acc=0.724, loss=61.999, grad_norm=5.662, loss_scale=1.000, learning_rate=7.156e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 05:37:55,777 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-08 05:38:21,871 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 05:38:38,172 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 05:38:38,172 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 05:38:38,174 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 05:39:04,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:05,315] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:05,588] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:05,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:06,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:06,298] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:06,712] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:07,081] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:07,687] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:08,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:08,294] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:08,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:08,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:09,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:09,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:10,058] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:53,251] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:53,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:53,512] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:53,791] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:54,722] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:55,517] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:55,992] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:56,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:57,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:57,152] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:57,264] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:57,869] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:57,911] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:57,977] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:59,086] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:39:59,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:40,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:41,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:41,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:42,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:44,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:44,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:45,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:45,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:45,583] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:45,928] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:46,193] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:46,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:46,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:46,925] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:47,591] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:40:47,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:27,810] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:29,343] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:29,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:29,829] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:32,811] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:33,216] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:34,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:34,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:34,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:34,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:35,235] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:35,856] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:35,955] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:36,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:37,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:41:41,061] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 05:42:24,847 (deepspeed_trainer:228) INFO: 32epoch:train:3701-3800batch: iter_time=2.502, loss_ctc=75.347, loss_att=60.331, acc=0.722, loss=64.833, grad_norm=6.780, loss_scale=1.000, learning_rate=7.155e-05, step_time=0.390 [cnode7-012:0/16] 2024-12-08 05:42:58,644 (deepspeed_trainer:228) INFO: 32epoch:train:3801-3900batch: iter_time=1.141e-04, loss_ctc=74.668, loss_att=61.734, acc=0.712, loss=65.616, grad_norm=6.162, loss_scale=1.000, learning_rate=7.155e-05, step_time=0.338 [2024-12-08 05:43:31,777] [INFO] [logging.py:129:log_dist] [Rank 0] step=469000, skipped=0, lr=[np.float64(7.153502553232723e-05)], mom=[[0.9, 0.98]] [2024-12-08 05:43:31,778] [INFO] [timer.py:264:stop] epoch=0/micro_step=79000/global_step=79000, RunningAvgSamplesPerSec=49.29083255733385, CurrSamplesPerSec=50.97214378773213, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 05:43:31,780 (deepspeed_trainer:228) INFO: 32epoch:train:3901-4000batch: iter_time=1.059e-04, loss_ctc=67.607, loss_att=52.316, acc=0.713, loss=56.906, grad_norm=5.948, loss_scale=1.000, learning_rate=7.154e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 05:44:05,086 (deepspeed_trainer:228) INFO: 32epoch:train:4001-4100batch: iter_time=1.148e-04, loss_ctc=64.115, loss_att=49.185, acc=0.723, loss=53.650, grad_norm=5.129, loss_scale=1.000, learning_rate=7.153e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 05:44:39,432 (deepspeed_trainer:228) INFO: 32epoch:train:4101-4200batch: iter_time=1.057e-04, loss_ctc=65.078, loss_att=50.871, acc=0.728, loss=55.155, grad_norm=5.032, loss_scale=1.000, learning_rate=7.152e-05, step_time=0.343 [cnode7-012:0/16] 2024-12-08 05:45:12,734 (deepspeed_trainer:228) INFO: 32epoch:train:4201-4300batch: iter_time=1.108e-04, loss_ctc=70.739, loss_att=58.215, acc=0.708, loss=61.960, grad_norm=5.715, loss_scale=1.000, learning_rate=7.152e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 05:45:45,885 (deepspeed_trainer:228) INFO: 32epoch:train:4301-4400batch: iter_time=1.123e-04, loss_ctc=72.593, loss_att=59.869, acc=0.697, loss=63.696, grad_norm=6.420, loss_scale=1.000, learning_rate=7.151e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 05:46:20,379 (deepspeed_trainer:228) INFO: 32epoch:train:4401-4500batch: iter_time=1.110e-04, loss_ctc=69.482, loss_att=55.977, acc=0.703, loss=60.032, grad_norm=5.880, loss_scale=1.000, learning_rate=7.150e-05, step_time=0.345 [cnode7-012:0/16] 2024-12-08 05:46:53,765 (deepspeed_trainer:228) INFO: 32epoch:train:4501-4600batch: iter_time=1.100e-04, loss_ctc=71.279, loss_att=53.076, acc=0.720, loss=58.517, grad_norm=5.690, loss_scale=1.000, learning_rate=7.149e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 05:47:27,262 (deepspeed_trainer:228) INFO: 32epoch:train:4601-4700batch: iter_time=1.134e-04, loss_ctc=73.323, loss_att=56.011, acc=0.703, loss=61.185, grad_norm=6.550, loss_scale=1.000, learning_rate=7.149e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 05:48:00,405 (deepspeed_trainer:228) INFO: 32epoch:train:4701-4800batch: iter_time=1.153e-04, loss_ctc=84.854, loss_att=58.686, acc=0.708, loss=66.530, grad_norm=5.928, loss_scale=1.000, learning_rate=7.148e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 05:48:33,343 (deepspeed_trainer:228) INFO: 32epoch:train:4801-4900batch: iter_time=1.105e-04, loss_ctc=73.102, loss_att=53.433, acc=0.704, loss=59.329, grad_norm=6.716, loss_scale=1.000, learning_rate=7.147e-05, step_time=0.329 [2024-12-08 05:49:06,423] [INFO] [logging.py:129:log_dist] [Rank 0] step=470000, skipped=0, lr=[np.float64(7.145888408100063e-05)], mom=[[0.9, 0.98]] [2024-12-08 05:49:06,424] [INFO] [timer.py:264:stop] epoch=0/micro_step=80000/global_step=80000, RunningAvgSamplesPerSec=49.29046037840084, CurrSamplesPerSec=44.947162417735484, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 05:49:06,426 (deepspeed_trainer:228) INFO: 32epoch:train:4901-5000batch: iter_time=1.148e-04, loss_ctc=71.087, loss_att=52.148, acc=0.717, loss=57.828, grad_norm=6.373, loss_scale=1.000, learning_rate=7.146e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 05:49:40,660 (deepspeed_trainer:228) INFO: 32epoch:train:5001-5100batch: iter_time=1.106e-04, loss_ctc=83.239, loss_att=65.902, acc=0.714, loss=71.114, grad_norm=6.231, loss_scale=1.000, learning_rate=7.146e-05, step_time=0.342 [cnode7-012:0/16] 2024-12-08 05:50:14,213 (deepspeed_trainer:228) INFO: 32epoch:train:5101-5200batch: iter_time=1.138e-04, loss_ctc=72.285, loss_att=55.302, acc=0.722, loss=60.379, grad_norm=6.687, loss_scale=1.000, learning_rate=7.145e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 05:50:47,386 (deepspeed_trainer:228) INFO: 32epoch:train:5201-5300batch: iter_time=1.130e-04, loss_ctc=65.569, loss_att=45.855, acc=0.719, loss=51.765, grad_norm=5.412, loss_scale=1.000, learning_rate=7.144e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 05:51:21,113 (deepspeed_trainer:228) INFO: 32epoch:train:5301-5400batch: iter_time=1.082e-04, loss_ctc=73.373, loss_att=61.215, acc=0.712, loss=64.872, grad_norm=5.600, loss_scale=1.000, learning_rate=7.143e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 05:51:54,484 (deepspeed_trainer:228) INFO: 32epoch:train:5401-5500batch: iter_time=1.129e-04, loss_ctc=67.107, loss_att=46.660, acc=0.731, loss=52.777, grad_norm=4.886, loss_scale=1.000, learning_rate=7.142e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 05:52:27,597 (deepspeed_trainer:228) INFO: 32epoch:train:5501-5600batch: iter_time=1.092e-04, loss_ctc=80.858, loss_att=62.015, acc=0.707, loss=67.688, grad_norm=7.822, loss_scale=1.000, learning_rate=7.142e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 05:52:39,598 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-08 05:53:06,217 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 05:53:23,064 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 05:53:23,064 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 05:53:23,066 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 05:53:48,213] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:53:48,432] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:53:48,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:53:48,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:53:49,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:53:50,737] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:53:51,060] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:53:51,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:53:52,278] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:53:52,376] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:53:52,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:53:52,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:53:52,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:53:52,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:53:53,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:53:54,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:36,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:37,090] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:37,634] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:37,877] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:38,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:40,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:41,266] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:41,569] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:41,785] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:42,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:42,213] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:42,518] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:42,814] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:43,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:43,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:54:44,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:25,392] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:26,190] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:26,831] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:26,967] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:27,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:29,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:30,556] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:30,887] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:31,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:31,951] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:32,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:32,702] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:33,583] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:33,799] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:35,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:55:36,036] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:13,789] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:13,828] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:14,742] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:15,398] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:19,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:19,634] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:20,275] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:20,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:20,878] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:21,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:22,339] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:22,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:22,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:24,743] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:24,756] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 05:56:27,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 05:57:20,130 (deepspeed_trainer:228) INFO: 32epoch:train:5601-5700batch: iter_time=2.561, loss_ctc=72.790, loss_att=62.599, acc=0.723, loss=65.651, grad_norm=5.904, loss_scale=1.000, learning_rate=7.141e-05, step_time=0.360 [cnode7-012:0/16] 2024-12-08 05:57:53,622 (deepspeed_trainer:228) INFO: 32epoch:train:5701-5800batch: iter_time=1.061e-04, loss_ctc=69.839, loss_att=55.400, acc=0.723, loss=59.730, grad_norm=6.549, loss_scale=1.000, learning_rate=7.140e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 05:58:26,703 (deepspeed_trainer:228) INFO: 32epoch:train:5801-5900batch: iter_time=1.117e-04, loss_ctc=63.913, loss_att=48.743, acc=0.724, loss=53.283, grad_norm=6.262, loss_scale=1.000, learning_rate=7.139e-05, step_time=0.330 [2024-12-08 05:58:59,823] [INFO] [logging.py:129:log_dist] [Rank 0] step=471000, skipped=0, lr=[np.float64(7.138298524674922e-05)], mom=[[0.9, 0.98]] [2024-12-08 05:58:59,824] [INFO] [timer.py:264:stop] epoch=0/micro_step=81000/global_step=81000, RunningAvgSamplesPerSec=49.28547463456305, CurrSamplesPerSec=52.03890685305581, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 05:58:59,826 (deepspeed_trainer:228) INFO: 32epoch:train:5901-6000batch: iter_time=1.099e-04, loss_ctc=66.953, loss_att=48.501, acc=0.733, loss=54.059, grad_norm=5.070, loss_scale=1.000, learning_rate=7.139e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 05:59:32,998 (deepspeed_trainer:228) INFO: 32epoch:train:6001-6100batch: iter_time=1.068e-04, loss_ctc=67.405, loss_att=57.018, acc=0.726, loss=60.148, grad_norm=5.365, loss_scale=1.000, learning_rate=7.138e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 06:00:06,145 (deepspeed_trainer:228) INFO: 32epoch:train:6101-6200batch: iter_time=1.057e-04, loss_ctc=68.502, loss_att=55.209, acc=0.717, loss=59.197, grad_norm=5.264, loss_scale=1.000, learning_rate=7.137e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 06:00:39,569 (deepspeed_trainer:228) INFO: 32epoch:train:6201-6300batch: iter_time=1.061e-04, loss_ctc=70.916, loss_att=58.090, acc=0.702, loss=61.943, grad_norm=6.689, loss_scale=1.000, learning_rate=7.136e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 06:01:13,003 (deepspeed_trainer:228) INFO: 32epoch:train:6301-6400batch: iter_time=1.093e-04, loss_ctc=73.002, loss_att=59.784, acc=0.715, loss=63.748, grad_norm=5.493, loss_scale=1.000, learning_rate=7.136e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 06:01:46,105 (deepspeed_trainer:228) INFO: 32epoch:train:6401-6500batch: iter_time=1.128e-04, loss_ctc=68.201, loss_att=49.729, acc=0.722, loss=55.271, grad_norm=5.286, loss_scale=1.000, learning_rate=7.135e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 06:02:19,101 (deepspeed_trainer:228) INFO: 32epoch:train:6501-6600batch: iter_time=1.094e-04, loss_ctc=81.749, loss_att=57.835, acc=0.709, loss=64.981, grad_norm=6.259, loss_scale=1.000, learning_rate=7.134e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 06:02:52,261 (deepspeed_trainer:228) INFO: 32epoch:train:6601-6700batch: iter_time=1.124e-04, loss_ctc=79.552, loss_att=57.261, acc=0.717, loss=63.992, grad_norm=5.956, loss_scale=1.000, learning_rate=7.133e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 06:03:25,170 (deepspeed_trainer:228) INFO: 32epoch:train:6701-6800batch: iter_time=1.102e-04, loss_ctc=71.756, loss_att=53.008, acc=0.708, loss=58.611, grad_norm=6.317, loss_scale=1.000, learning_rate=7.133e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 06:03:58,097 (deepspeed_trainer:228) INFO: 32epoch:train:6801-6900batch: iter_time=1.070e-04, loss_ctc=71.909, loss_att=57.228, acc=0.714, loss=61.620, grad_norm=6.102, loss_scale=1.000, learning_rate=7.132e-05, step_time=0.329 [2024-12-08 06:04:31,103] [INFO] [logging.py:129:log_dist] [Rank 0] step=472000, skipped=0, lr=[np.float64(7.13073277438458e-05)], mom=[[0.9, 0.98]] [2024-12-08 06:04:31,104] [INFO] [timer.py:264:stop] epoch=0/micro_step=82000/global_step=82000, RunningAvgSamplesPerSec=49.2912047686429, CurrSamplesPerSec=51.71265271099195, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 06:04:31,107 (deepspeed_trainer:228) INFO: 32epoch:train:6901-7000batch: iter_time=1.114e-04, loss_ctc=79.904, loss_att=60.531, acc=0.730, loss=66.344, grad_norm=6.016, loss_scale=1.000, learning_rate=7.131e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 06:05:03,928 (deepspeed_trainer:228) INFO: 32epoch:train:7001-7100batch: iter_time=1.084e-04, loss_ctc=71.535, loss_att=53.317, acc=0.722, loss=58.767, grad_norm=5.492, loss_scale=1.000, learning_rate=7.130e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 06:05:36,560 (deepspeed_trainer:228) INFO: 32epoch:train:7101-7200batch: iter_time=1.067e-04, loss_ctc=67.791, loss_att=50.139, acc=0.718, loss=55.429, grad_norm=5.266, loss_scale=1.000, learning_rate=7.130e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 06:06:09,566 (deepspeed_trainer:228) INFO: 32epoch:train:7201-7300batch: iter_time=1.118e-04, loss_ctc=71.472, loss_att=56.719, acc=0.734, loss=61.124, grad_norm=5.238, loss_scale=1.000, learning_rate=7.129e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 06:06:42,360 (deepspeed_trainer:228) INFO: 32epoch:train:7301-7400batch: iter_time=1.107e-04, loss_ctc=67.323, loss_att=50.738, acc=0.725, loss=55.694, grad_norm=5.281, loss_scale=1.000, learning_rate=7.128e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 06:07:15,295 (deepspeed_trainer:228) INFO: 32epoch:train:7401-7500batch: iter_time=1.062e-04, loss_ctc=80.836, loss_att=62.112, acc=0.716, loss=67.750, grad_norm=7.023, loss_scale=1.000, learning_rate=7.127e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 06:07:19,273 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-08 06:07:46,566 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 06:08:04,226 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 06:08:04,226 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 06:08:04,229 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 06:08:27,425] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:08:28,447] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:08:28,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:08:28,793] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:08:30,137] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:08:30,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:08:30,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:08:30,934] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:08:31,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:08:31,252] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:08:31,293] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:08:31,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:08:31,565] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:08:33,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:08:32,995] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:08:42,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:16,190] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:18,120] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:19,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:19,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:19,657] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:19,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:20,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:20,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:20,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:20,907] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:21,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:21,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:21,463] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:21,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:22,662] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:09:30,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:05,865] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:06,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:08,077] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:08,790] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:09,153] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:10,629] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:10,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:10,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:10,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:11,214] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:11,948] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:12,036] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:12,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:12,661] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:14,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:18,853] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:54,491] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:54,895] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:57,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:57,584] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:57,873] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:58,754] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:59,835] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:59,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:10:59,983] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:11:00,676] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:11:01,153] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:11:02,489] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:11:02,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:11:03,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:11:05,227] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:11:07,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 06:12:09,015 (deepspeed_trainer:228) INFO: 32epoch:train:7501-7600batch: iter_time=2.537, loss_ctc=70.323, loss_att=57.924, acc=0.721, loss=61.593, grad_norm=5.138, loss_scale=1.000, learning_rate=7.127e-05, step_time=0.399 [cnode7-012:0/16] 2024-12-08 06:12:43,204 (deepspeed_trainer:228) INFO: 32epoch:train:7601-7700batch: iter_time=1.127e-04, loss_ctc=73.620, loss_att=59.900, acc=0.710, loss=63.977, grad_norm=6.359, loss_scale=1.000, learning_rate=7.126e-05, step_time=0.342 [cnode7-012:0/16] 2024-12-08 06:13:16,098 (deepspeed_trainer:228) INFO: 32epoch:train:7701-7800batch: iter_time=1.080e-04, loss_ctc=60.923, loss_att=46.144, acc=0.726, loss=50.579, grad_norm=5.214, loss_scale=1.000, learning_rate=7.125e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 06:13:49,213 (deepspeed_trainer:228) INFO: 32epoch:train:7801-7900batch: iter_time=1.090e-04, loss_ctc=68.504, loss_att=51.461, acc=0.726, loss=56.572, grad_norm=5.169, loss_scale=1.000, learning_rate=7.124e-05, step_time=0.331 [2024-12-08 06:14:22,397] [INFO] [logging.py:129:log_dist] [Rank 0] step=473000, skipped=0, lr=[np.float64(7.123191029608207e-05)], mom=[[0.9, 0.98]] [2024-12-08 06:14:22,400] [INFO] [timer.py:264:stop] epoch=0/micro_step=83000/global_step=83000, RunningAvgSamplesPerSec=49.285556117798606, CurrSamplesPerSec=48.21661898184971, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 06:14:22,403 (deepspeed_trainer:228) INFO: 32epoch:train:7901-8000batch: iter_time=1.095e-04, loss_ctc=66.884, loss_att=54.709, acc=0.722, loss=58.367, grad_norm=5.464, loss_scale=1.000, learning_rate=7.124e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 06:14:55,597 (deepspeed_trainer:228) INFO: 32epoch:train:8001-8100batch: iter_time=1.076e-04, loss_ctc=73.226, loss_att=60.129, acc=0.701, loss=64.037, grad_norm=5.759, loss_scale=1.000, learning_rate=7.123e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 06:15:28,572 (deepspeed_trainer:228) INFO: 32epoch:train:8101-8200batch: iter_time=1.085e-04, loss_ctc=63.241, loss_att=48.719, acc=0.714, loss=53.079, grad_norm=5.486, loss_scale=1.000, learning_rate=7.122e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 06:16:01,946 (deepspeed_trainer:228) INFO: 32epoch:train:8201-8300batch: iter_time=1.140e-04, loss_ctc=77.022, loss_att=61.097, acc=0.705, loss=65.873, grad_norm=5.812, loss_scale=1.000, learning_rate=7.121e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 06:16:35,237 (deepspeed_trainer:228) INFO: 32epoch:train:8301-8400batch: iter_time=1.110e-04, loss_ctc=69.257, loss_att=53.179, acc=0.713, loss=57.989, grad_norm=5.164, loss_scale=1.000, learning_rate=7.121e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 06:17:09,082 (deepspeed_trainer:228) INFO: 32epoch:train:8401-8500batch: iter_time=1.168e-04, loss_ctc=84.678, loss_att=57.622, acc=0.714, loss=65.757, grad_norm=5.878, loss_scale=1.000, learning_rate=7.120e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 06:17:42,520 (deepspeed_trainer:228) INFO: 32epoch:train:8501-8600batch: iter_time=1.080e-04, loss_ctc=73.716, loss_att=53.081, acc=0.705, loss=59.249, grad_norm=5.553, loss_scale=1.000, learning_rate=7.119e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 06:18:16,092 (deepspeed_trainer:228) INFO: 32epoch:train:8601-8700batch: iter_time=1.087e-04, loss_ctc=70.527, loss_att=55.061, acc=0.711, loss=59.683, grad_norm=5.595, loss_scale=1.000, learning_rate=7.118e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 06:18:49,780 (deepspeed_trainer:228) INFO: 32epoch:train:8701-8800batch: iter_time=1.063e-04, loss_ctc=74.839, loss_att=57.894, acc=0.712, loss=62.964, grad_norm=5.889, loss_scale=1.000, learning_rate=7.118e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 06:19:23,610 (deepspeed_trainer:228) INFO: 32epoch:train:8801-8900batch: iter_time=1.099e-04, loss_ctc=78.377, loss_att=59.663, acc=0.725, loss=65.287, grad_norm=5.734, loss_scale=1.000, learning_rate=7.117e-05, step_time=0.338 [2024-12-08 06:19:57,087] [INFO] [logging.py:129:log_dist] [Rank 0] step=474000, skipped=0, lr=[np.float64(7.115673163667809e-05)], mom=[[0.9, 0.98]] [2024-12-08 06:19:57,088] [INFO] [timer.py:264:stop] epoch=0/micro_step=84000/global_step=84000, RunningAvgSamplesPerSec=49.28526228040433, CurrSamplesPerSec=48.774160027171554, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 06:19:57,090 (deepspeed_trainer:228) INFO: 32epoch:train:8901-9000batch: iter_time=1.094e-04, loss_ctc=65.500, loss_att=47.957, acc=0.723, loss=53.227, grad_norm=4.817, loss_scale=1.000, learning_rate=7.116e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 06:20:30,886 (deepspeed_trainer:228) INFO: 32epoch:train:9001-9100batch: iter_time=1.086e-04, loss_ctc=74.054, loss_att=56.373, acc=0.710, loss=61.658, grad_norm=4.981, loss_scale=1.000, learning_rate=7.115e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 06:21:04,536 (deepspeed_trainer:228) INFO: 32epoch:train:9101-9200batch: iter_time=1.113e-04, loss_ctc=67.998, loss_att=53.363, acc=0.730, loss=57.748, grad_norm=5.236, loss_scale=1.000, learning_rate=7.115e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 06:21:38,200 (deepspeed_trainer:228) INFO: 32epoch:train:9201-9300batch: iter_time=1.093e-04, loss_ctc=70.982, loss_att=53.093, acc=0.719, loss=58.463, grad_norm=4.918, loss_scale=1.000, learning_rate=7.114e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 06:22:08,085 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-08 06:22:34,797 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 06:22:50,774 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 06:22:50,774 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 06:22:50,776 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 06:23:19,507] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:23:19,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:23:19,716] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:23:19,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:23:19,855] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:23:20,110] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:23:20,404] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:23:20,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:23:20,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:23:20,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:23:20,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:23:21,463] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:23:21,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:23:22,221] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:23:22,279] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:23:23,371] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:08,397] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:08,773] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:09,098] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:09,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:10,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:10,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:10,592] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:10,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:10,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:10,846] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:11,417] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:11,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:11,916] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:11,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:12,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:13,616] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:58,273] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:58,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:58,819] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:59,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:59,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:59,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:24:59,723] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:00,318] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:00,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:00,719] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:00,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:00,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:01,489] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:01,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:02,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:04,179] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:47,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:48,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:48,972] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:49,017] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:49,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:50,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:51,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:51,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:51,830] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:51,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:51,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:51,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:51,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:51,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:52,038] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:25:54,529] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 06:26:30,122 (deepspeed_trainer:228) INFO: 32epoch:train:9301-9400batch: iter_time=2.533, loss_ctc=76.004, loss_att=58.692, acc=0.716, loss=63.911, grad_norm=6.447, loss_scale=1.000, learning_rate=7.113e-05, step_time=0.385 [cnode7-012:0/16] 2024-12-08 06:27:03,637 (deepspeed_trainer:228) INFO: 32epoch:train:9401-9500batch: iter_time=1.141e-04, loss_ctc=71.790, loss_att=56.321, acc=0.722, loss=60.970, grad_norm=4.879, loss_scale=1.000, learning_rate=7.112e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 06:27:37,476 (deepspeed_trainer:228) INFO: 32epoch:train:9501-9600batch: iter_time=1.119e-04, loss_ctc=73.593, loss_att=57.839, acc=0.715, loss=62.549, grad_norm=6.476, loss_scale=1.000, learning_rate=7.112e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 06:28:11,511 (deepspeed_trainer:228) INFO: 32epoch:train:9601-9700batch: iter_time=1.115e-04, loss_ctc=61.718, loss_att=45.414, acc=0.730, loss=50.287, grad_norm=4.997, loss_scale=1.000, learning_rate=7.111e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 06:28:44,914 (deepspeed_trainer:228) INFO: 32epoch:train:9701-9800batch: iter_time=1.117e-04, loss_ctc=64.468, loss_att=48.777, acc=0.729, loss=53.468, grad_norm=5.036, loss_scale=1.000, learning_rate=7.110e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 06:29:18,510 (deepspeed_trainer:228) INFO: 32epoch:train:9801-9900batch: iter_time=1.096e-04, loss_ctc=71.206, loss_att=58.594, acc=0.713, loss=62.389, grad_norm=5.379, loss_scale=1.000, learning_rate=7.109e-05, step_time=0.335 [2024-12-08 06:29:51,743] [INFO] [logging.py:129:log_dist] [Rank 0] step=475000, skipped=0, lr=[np.float64(7.108179050819309e-05)], mom=[[0.9, 0.98]] [2024-12-08 06:29:51,744] [INFO] [timer.py:264:stop] epoch=0/micro_step=85000/global_step=85000, RunningAvgSamplesPerSec=49.27313913984831, CurrSamplesPerSec=47.49610015139969, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 06:29:51,746 (deepspeed_trainer:228) INFO: 32epoch:train:9901-10000batch: iter_time=1.098e-04, loss_ctc=69.282, loss_att=56.507, acc=0.709, loss=60.360, grad_norm=5.780, loss_scale=1.000, learning_rate=7.109e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 06:30:26,246 (deepspeed_trainer:228) INFO: 32epoch:train:10001-10100batch: iter_time=1.110e-04, loss_ctc=65.829, loss_att=53.747, acc=0.701, loss=57.381, grad_norm=5.667, loss_scale=1.000, learning_rate=7.108e-05, step_time=0.345 [cnode7-012:0/16] 2024-12-08 06:30:59,864 (deepspeed_trainer:228) INFO: 32epoch:train:10101-10200batch: iter_time=1.125e-04, loss_ctc=77.351, loss_att=59.155, acc=0.715, loss=64.625, grad_norm=5.948, loss_scale=1.000, learning_rate=7.107e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 06:31:32,963 (deepspeed_trainer:228) INFO: 32epoch:train:10201-10300batch: iter_time=1.109e-04, loss_ctc=68.942, loss_att=52.590, acc=0.705, loss=57.504, grad_norm=5.296, loss_scale=1.000, learning_rate=7.106e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 06:32:06,242 (deepspeed_trainer:228) INFO: 32epoch:train:10301-10400batch: iter_time=1.111e-04, loss_ctc=85.469, loss_att=57.814, acc=0.717, loss=66.125, grad_norm=6.176, loss_scale=1.000, learning_rate=7.106e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 06:32:39,449 (deepspeed_trainer:228) INFO: 32epoch:train:10401-10500batch: iter_time=1.100e-04, loss_ctc=73.891, loss_att=54.850, acc=0.707, loss=60.564, grad_norm=5.853, loss_scale=1.000, learning_rate=7.105e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 06:33:12,449 (deepspeed_trainer:228) INFO: 32epoch:train:10501-10600batch: iter_time=1.129e-04, loss_ctc=67.187, loss_att=50.123, acc=0.717, loss=55.220, grad_norm=6.165, loss_scale=1.000, learning_rate=7.104e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 06:33:46,268 (deepspeed_trainer:228) INFO: 32epoch:train:10601-10700batch: iter_time=1.102e-04, loss_ctc=84.272, loss_att=66.074, acc=0.707, loss=71.546, grad_norm=6.253, loss_scale=1.000, learning_rate=7.103e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 06:34:19,516 (deepspeed_trainer:228) INFO: 32epoch:train:10701-10800batch: iter_time=1.098e-04, loss_ctc=70.237, loss_att=54.112, acc=0.727, loss=58.956, grad_norm=5.364, loss_scale=1.000, learning_rate=7.103e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 06:34:52,599 (deepspeed_trainer:228) INFO: 32epoch:train:10801-10900batch: iter_time=1.151e-04, loss_ctc=66.857, loss_att=48.264, acc=0.721, loss=53.838, grad_norm=5.143, loss_scale=1.000, learning_rate=7.102e-05, step_time=0.330 [2024-12-08 06:35:26,027] [INFO] [logging.py:129:log_dist] [Rank 0] step=476000, skipped=0, lr=[np.float64(7.100708566243695e-05)], mom=[[0.9, 0.98]] [2024-12-08 06:35:26,027] [INFO] [timer.py:264:stop] epoch=0/micro_step=86000/global_step=86000, RunningAvgSamplesPerSec=49.27416843873626, CurrSamplesPerSec=51.22430081330821, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 06:35:26,030 (deepspeed_trainer:228) INFO: 32epoch:train:10901-11000batch: iter_time=1.132e-04, loss_ctc=74.228, loss_att=59.642, acc=0.710, loss=64.012, grad_norm=5.136, loss_scale=1.000, learning_rate=7.101e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 06:35:59,296 (deepspeed_trainer:228) INFO: 32epoch:train:11001-11100batch: iter_time=1.100e-04, loss_ctc=64.953, loss_att=47.412, acc=0.730, loss=52.665, grad_norm=5.512, loss_scale=1.000, learning_rate=7.100e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 06:36:32,445 (deepspeed_trainer:228) INFO: 32epoch:train:11101-11200batch: iter_time=1.092e-04, loss_ctc=74.059, loss_att=55.155, acc=0.722, loss=60.811, grad_norm=5.621, loss_scale=1.000, learning_rate=7.100e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 06:36:53,243 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-08 06:37:20,611 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 06:37:36,820 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 06:37:36,820 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 06:37:36,822 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 06:38:02,571] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:02,821] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:03,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:04,146] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:04,362] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:04,402] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:04,773] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:04,366] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:05,157] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:04,722] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:04,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:05,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:05,351] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:06,084] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:06,046] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:06,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:51,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:53,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:52,959] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:53,731] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:54,080] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:54,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:54,129] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:54,992] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:55,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:55,080] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:56,159] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:56,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:55,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:56,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:56,834] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:38:58,150] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:40,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:41,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:42,048] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:41,414] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:42,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:43,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:44,469] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:45,376] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:45,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:45,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:45,981] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:45,531] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:46,251] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:46,288] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:48,472] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:39:48,860] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:27,515] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:28,376] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:29,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:29,705] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:30,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:31,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:33,507] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:34,315] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:34,043] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:34,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:35,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:35,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:35,720] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:36,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:37,887] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:40:39,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 06:41:23,644 (deepspeed_trainer:228) INFO: 32epoch:train:11201-11300batch: iter_time=2.535, loss_ctc=73.194, loss_att=57.524, acc=0.723, loss=62.212, grad_norm=6.244, loss_scale=1.000, learning_rate=7.099e-05, step_time=0.380 [cnode7-012:0/16] 2024-12-08 06:41:57,682 (deepspeed_trainer:228) INFO: 32epoch:train:11301-11400batch: iter_time=1.108e-04, loss_ctc=74.403, loss_att=58.615, acc=0.720, loss=63.362, grad_norm=5.785, loss_scale=1.000, learning_rate=7.098e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 06:42:31,641 (deepspeed_trainer:228) INFO: 32epoch:train:11401-11500batch: iter_time=1.102e-04, loss_ctc=67.213, loss_att=51.264, acc=0.718, loss=56.044, grad_norm=5.924, loss_scale=1.000, learning_rate=7.097e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 06:43:04,776 (deepspeed_trainer:228) INFO: 32epoch:train:11501-11600batch: iter_time=1.100e-04, loss_ctc=63.496, loss_att=48.767, acc=0.726, loss=53.172, grad_norm=5.266, loss_scale=1.000, learning_rate=7.097e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 06:43:38,265 (deepspeed_trainer:228) INFO: 32epoch:train:11601-11700batch: iter_time=1.082e-04, loss_ctc=64.616, loss_att=49.314, acc=0.733, loss=53.908, grad_norm=4.685, loss_scale=1.000, learning_rate=7.096e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 06:44:12,100 (deepspeed_trainer:228) INFO: 32epoch:train:11701-11800batch: iter_time=1.109e-04, loss_ctc=70.334, loss_att=57.234, acc=0.711, loss=61.159, grad_norm=5.105, loss_scale=1.000, learning_rate=7.095e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 06:44:45,898 (deepspeed_trainer:228) INFO: 32epoch:train:11801-11900batch: iter_time=1.111e-04, loss_ctc=71.368, loss_att=59.062, acc=0.700, loss=62.766, grad_norm=6.310, loss_scale=1.000, learning_rate=7.094e-05, step_time=0.338 [2024-12-08 06:45:19,118] [INFO] [logging.py:129:log_dist] [Rank 0] step=477000, skipped=0, lr=[np.float64(7.093261586038303e-05)], mom=[[0.9, 0.98]] [2024-12-08 06:45:19,119] [INFO] [timer.py:264:stop] epoch=0/micro_step=87000/global_step=87000, RunningAvgSamplesPerSec=49.26495446279831, CurrSamplesPerSec=47.95541426801267, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 06:45:19,121 (deepspeed_trainer:228) INFO: 32epoch:train:11901-12000batch: iter_time=1.092e-04, loss_ctc=68.309, loss_att=54.068, acc=0.710, loss=58.364, grad_norm=5.725, loss_scale=1.000, learning_rate=7.094e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 06:45:52,367 (deepspeed_trainer:228) INFO: 32epoch:train:12001-12100batch: iter_time=1.130e-04, loss_ctc=69.964, loss_att=52.133, acc=0.721, loss=57.483, grad_norm=5.412, loss_scale=1.000, learning_rate=7.093e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 06:46:25,130 (deepspeed_trainer:228) INFO: 32epoch:train:12101-12200batch: iter_time=1.093e-04, loss_ctc=72.431, loss_att=55.611, acc=0.704, loss=60.646, grad_norm=6.805, loss_scale=1.000, learning_rate=7.092e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 06:46:58,109 (deepspeed_trainer:228) INFO: 32epoch:train:12201-12300batch: iter_time=1.119e-04, loss_ctc=83.657, loss_att=57.784, acc=0.712, loss=65.565, grad_norm=6.111, loss_scale=1.000, learning_rate=7.091e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 06:47:30,801 (deepspeed_trainer:228) INFO: 32epoch:train:12301-12400batch: iter_time=1.096e-04, loss_ctc=72.149, loss_att=53.044, acc=0.709, loss=58.788, grad_norm=6.042, loss_scale=1.000, learning_rate=7.091e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 06:48:03,436 (deepspeed_trainer:228) INFO: 32epoch:train:12401-12500batch: iter_time=1.111e-04, loss_ctc=69.584, loss_att=51.154, acc=0.719, loss=56.677, grad_norm=5.625, loss_scale=1.000, learning_rate=7.090e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 06:48:36,349 (deepspeed_trainer:228) INFO: 32epoch:train:12501-12600batch: iter_time=1.089e-04, loss_ctc=82.307, loss_att=66.120, acc=0.714, loss=70.963, grad_norm=6.339, loss_scale=1.000, learning_rate=7.089e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 06:49:09,174 (deepspeed_trainer:228) INFO: 32epoch:train:12601-12700batch: iter_time=1.144e-04, loss_ctc=72.653, loss_att=54.729, acc=0.725, loss=60.084, grad_norm=5.296, loss_scale=1.000, learning_rate=7.088e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 06:49:41,740 (deepspeed_trainer:228) INFO: 32epoch:train:12701-12800batch: iter_time=1.123e-04, loss_ctc=65.441, loss_att=46.002, acc=0.719, loss=51.833, grad_norm=5.500, loss_scale=1.000, learning_rate=7.088e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-08 06:50:14,680 (deepspeed_trainer:228) INFO: 32epoch:train:12801-12900batch: iter_time=1.096e-04, loss_ctc=73.052, loss_att=61.397, acc=0.715, loss=64.927, grad_norm=5.065, loss_scale=1.000, learning_rate=7.087e-05, step_time=0.329 [2024-12-08 06:50:47,183] [INFO] [logging.py:129:log_dist] [Rank 0] step=478000, skipped=0, lr=[np.float64(7.085837987208174e-05)], mom=[[0.9, 0.98]] [2024-12-08 06:50:47,183] [INFO] [timer.py:264:stop] epoch=0/micro_step=88000/global_step=88000, RunningAvgSamplesPerSec=49.276135880130425, CurrSamplesPerSec=52.52160755557209, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 06:50:47,186 (deepspeed_trainer:228) INFO: 32epoch:train:12901-13000batch: iter_time=1.113e-04, loss_ctc=66.373, loss_att=45.868, acc=0.734, loss=52.017, grad_norm=4.945, loss_scale=1.000, learning_rate=7.086e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-08 06:51:19,964 (deepspeed_trainer:228) INFO: 32epoch:train:13001-13100batch: iter_time=1.088e-04, loss_ctc=79.688, loss_att=58.786, acc=0.713, loss=65.068, grad_norm=7.282, loss_scale=1.000, learning_rate=7.085e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 06:51:31,935 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-08 06:51:59,131 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 06:52:16,649 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 06:52:16,649 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 06:52:16,651 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 06:52:41,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:52:41,898] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:52:42,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:52:42,337] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:52:43,270] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:52:42,899] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:52:42,979] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:52:43,033] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:52:43,884] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:52:43,449] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:52:44,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:52:43,803] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:52:44,460] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:52:44,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:52:43,987] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:52:44,907] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:30,381] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:30,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:32,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:31,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:32,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:32,751] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:32,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:33,008] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:32,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:33,327] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:32,740] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:33,686] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:33,077] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:33,542] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:33,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:53:37,336] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:19,158] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:19,392] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:20,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:20,585] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:21,251] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:21,531] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:21,385] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:21,962] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:22,006] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:22,419] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:23,693] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:23,808] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:23,157] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:24,065] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:23,426] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:54:29,411] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:06,227] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:07,789] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:08,837] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:08,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:10,258] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:09,730] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:10,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:10,837] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:10,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:11,544] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:11,255] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:11,413] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:12,305] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:11,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:12,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 06:55:19,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 06:56:11,314 (deepspeed_trainer:228) INFO: 32epoch:train:13101-13200batch: iter_time=2.511, loss_ctc=72.138, loss_att=63.158, acc=0.723, loss=65.836, grad_norm=5.132, loss_scale=1.000, learning_rate=7.085e-05, step_time=0.402 [cnode7-012:0/16] 2024-12-08 06:56:44,719 (deepspeed_trainer:228) INFO: 32epoch:train:13201-13300batch: iter_time=1.213e-04, loss_ctc=69.173, loss_att=55.713, acc=0.726, loss=59.751, grad_norm=5.328, loss_scale=1.000, learning_rate=7.084e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 06:57:17,743 (deepspeed_trainer:228) INFO: 32epoch:train:13301-13400batch: iter_time=1.144e-04, loss_ctc=63.530, loss_att=48.263, acc=0.728, loss=52.832, grad_norm=6.435, loss_scale=1.000, learning_rate=7.083e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 06:57:51,371 (deepspeed_trainer:228) INFO: 32epoch:train:13401-13500batch: iter_time=1.159e-04, loss_ctc=66.519, loss_att=48.458, acc=0.735, loss=53.855, grad_norm=5.257, loss_scale=1.000, learning_rate=7.083e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 06:58:25,366 (deepspeed_trainer:228) INFO: 32epoch:train:13501-13600batch: iter_time=1.101e-04, loss_ctc=66.661, loss_att=56.472, acc=0.728, loss=59.528, grad_norm=5.150, loss_scale=1.000, learning_rate=7.082e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 06:58:58,744 (deepspeed_trainer:228) INFO: 32epoch:train:13601-13700batch: iter_time=1.073e-04, loss_ctc=67.635, loss_att=54.620, acc=0.719, loss=58.521, grad_norm=5.093, loss_scale=1.000, learning_rate=7.081e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 06:59:32,172 (deepspeed_trainer:228) INFO: 32epoch:train:13701-13800batch: iter_time=1.099e-04, loss_ctc=69.674, loss_att=57.938, acc=0.703, loss=61.454, grad_norm=5.665, loss_scale=1.000, learning_rate=7.080e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 07:00:05,698 (deepspeed_trainer:228) INFO: 32epoch:train:13801-13900batch: iter_time=1.086e-04, loss_ctc=72.477, loss_att=59.714, acc=0.716, loss=63.507, grad_norm=5.538, loss_scale=1.000, learning_rate=7.080e-05, step_time=0.335 [2024-12-08 07:00:39,084] [INFO] [logging.py:129:log_dist] [Rank 0] step=479000, skipped=0, lr=[np.float64(7.078437647657533e-05)], mom=[[0.9, 0.98]] [2024-12-08 07:00:39,085] [INFO] [timer.py:264:stop] epoch=0/micro_step=89000/global_step=89000, RunningAvgSamplesPerSec=49.26530604055269, CurrSamplesPerSec=49.30925048734433, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 07:00:39,087 (deepspeed_trainer:228) INFO: 32epoch:train:13901-14000batch: iter_time=1.074e-04, loss_ctc=67.771, loss_att=50.017, acc=0.724, loss=55.348, grad_norm=5.038, loss_scale=1.000, learning_rate=7.079e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 07:01:12,714 (deepspeed_trainer:228) INFO: 32epoch:train:14001-14100batch: iter_time=1.114e-04, loss_ctc=79.864, loss_att=57.892, acc=0.709, loss=64.486, grad_norm=5.925, loss_scale=1.000, learning_rate=7.078e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 07:01:46,042 (deepspeed_trainer:228) INFO: 32epoch:train:14101-14200batch: iter_time=1.073e-04, loss_ctc=79.498, loss_att=56.545, acc=0.721, loss=63.462, grad_norm=5.489, loss_scale=1.000, learning_rate=7.077e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 07:02:19,048 (deepspeed_trainer:228) INFO: 32epoch:train:14201-14300batch: iter_time=1.094e-04, loss_ctc=70.301, loss_att=52.615, acc=0.713, loss=57.933, grad_norm=6.037, loss_scale=1.000, learning_rate=7.077e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 07:02:52,213 (deepspeed_trainer:228) INFO: 32epoch:train:14301-14400batch: iter_time=1.065e-04, loss_ctc=70.498, loss_att=57.134, acc=0.715, loss=61.165, grad_norm=5.590, loss_scale=1.000, learning_rate=7.076e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 07:03:25,912 (deepspeed_trainer:228) INFO: 32epoch:train:14401-14500batch: iter_time=1.094e-04, loss_ctc=78.614, loss_att=60.398, acc=0.730, loss=65.847, grad_norm=5.471, loss_scale=1.000, learning_rate=7.075e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 07:03:59,274 (deepspeed_trainer:228) INFO: 32epoch:train:14501-14600batch: iter_time=1.106e-04, loss_ctc=70.699, loss_att=52.472, acc=0.724, loss=57.969, grad_norm=5.011, loss_scale=1.000, learning_rate=7.074e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 07:04:32,428 (deepspeed_trainer:228) INFO: 32epoch:train:14601-14700batch: iter_time=1.100e-04, loss_ctc=67.369, loss_att=50.081, acc=0.719, loss=55.270, grad_norm=5.396, loss_scale=1.000, learning_rate=7.074e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 07:05:05,925 (deepspeed_trainer:228) INFO: 32epoch:train:14701-14800batch: iter_time=1.113e-04, loss_ctc=70.900, loss_att=56.547, acc=0.736, loss=60.868, grad_norm=4.998, loss_scale=1.000, learning_rate=7.073e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 07:05:39,735 (deepspeed_trainer:228) INFO: 32epoch:train:14801-14900batch: iter_time=1.119e-04, loss_ctc=66.968, loss_att=50.660, acc=0.726, loss=55.547, grad_norm=5.172, loss_scale=1.000, learning_rate=7.072e-05, step_time=0.338 [2024-12-08 07:06:13,232] [INFO] [logging.py:129:log_dist] [Rank 0] step=480000, skipped=0, lr=[np.float64(7.071060446181347e-05)], mom=[[0.9, 0.98]] [2024-12-08 07:06:13,233] [INFO] [timer.py:264:stop] epoch=0/micro_step=90000/global_step=90000, RunningAvgSamplesPerSec=49.266475450571974, CurrSamplesPerSec=48.478583442173495, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 07:06:13,235 (deepspeed_trainer:228) INFO: 32epoch:train:14901-15000batch: iter_time=1.077e-04, loss_ctc=79.347, loss_att=61.259, acc=0.721, loss=66.682, grad_norm=6.543, loss_scale=1.000, learning_rate=7.071e-05, step_time=0.335 [2024-12-08 07:06:28,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:29,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:28,548] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:28,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:28,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:29,576] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:29,640] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:28,953] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:29,000] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:29,015] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:29,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:29,841] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:29,158] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:29,961] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:29,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:29,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:43,262] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:44,458] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:43,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:44,142] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:45,085] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:45,422] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:44,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:45,922] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:45,318] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:46,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:46,339] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:45,600] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:45,686] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:45,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:46,675] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:46,713] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:59,301] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:00,340] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:06:59,783] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:00,752] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:00,320] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:00,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:00,901] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:01,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:01,837] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:01,486] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:01,521] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:01,946] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:02,756] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:02,793] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:02,833] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:02,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:15,414] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:14,934] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:15,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:16,467] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:16,260] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:17,112] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:17,147] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:16,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:16,896] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:17,359] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:17,412] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:18,413] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:17,836] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:18,840] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:18,871] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:18,999] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:07:31,423] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 32 is about to be saved! [2024-12-08 07:07:31,452] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/mp_rank_00_model_states.pt [2024-12-08 07:07:31,452] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/mp_rank_00_model_states.pt... [2024-12-08 07:07:33,461] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/mp_rank_00_model_states.pt. [2024-12-08 07:07:32,856] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-08 07:07:32,857] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-08 07:07:33,622] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-08 07:07:32,859] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-08 07:07:32,859] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-08 07:07:33,625] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-08 07:07:33,625] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-08 07:07:33,626] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-08 07:07:32,863] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-08 07:07:33,626] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-08 07:07:33,627] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-08 07:07:32,864] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-08 07:07:33,628] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-08 07:07:33,628] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-08 07:07:32,866] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-08 07:07:32,866] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-08 07:07:34,313] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-08 07:07:34,313] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-08 07:07:34,313] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [2024-12-08 07:07:34,323] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-08 07:07:34,323] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-08 07:07:34,323] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [2024-12-08 07:07:34,397] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-08 07:07:34,397] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-08 07:07:34,397] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [2024-12-08 07:07:33,646] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-08 07:07:33,647] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-08 07:07:33,647] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [2024-12-08 07:07:34,420] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-08 07:07:34,420] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-08 07:07:34,420] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [2024-12-08 07:07:33,663] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-08 07:07:33,663] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-08 07:07:33,663] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [2024-12-08 07:07:34,435] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-08 07:07:34,435] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-08 07:07:34,436] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [2024-12-08 07:07:34,437] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-08 07:07:34,444] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-08 07:07:34,445] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-08 07:07:34,445] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [2024-12-08 07:07:34,448] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-08 07:07:34,448] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [2024-12-08 07:07:33,687] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-08 07:07:33,687] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-08 07:07:33,687] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [2024-12-08 07:07:34,451] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-08 07:07:34,452] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-08 07:07:34,452] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [2024-12-08 07:07:33,707] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-08 07:07:33,708] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-08 07:07:33,708] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [2024-12-08 07:07:33,709] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-08 07:07:33,709] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-08 07:07:33,709] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [2024-12-08 07:07:33,710] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-08 07:07:33,710] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-08 07:07:33,710] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [2024-12-08 07:07:33,710] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-08 07:07:33,710] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-08 07:07:33,710] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [2024-12-08 07:07:33,721] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-08 07:07:33,721] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_32/32/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-08 07:07:33,721] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 32 is ready now! [cnode7-012:0/16] 2024-12-08 07:07:34,492 (deepspeed_trainer:158) INFO: 32epoch results: [train] iter_time=0.135, loss_ctc=71.929, loss_att=55.282, acc=0.717, loss=60.276, grad_norm=5.813, loss_scale=1.000, learning_rate=7.127e-05, step_time=0.335, time=1 hour, 57 minutes and 42.06 seconds, total_count=480032, gpu_max_cached_mem_GB=41.424, [valid] loss_ctc=3.766, cer_ctc=0.103, loss_att=5.969, acc=0.887, cer=0.109, wer=0.938, loss=5.312, time=1 minute and 9.01 seconds, total_count=32, gpu_max_cached_mem_GB=41.424 [cnode7-012:0/16] 2024-12-08 07:07:35,718 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-08 07:08:03,048 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 07:08:18,698 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 07:08:18,698 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 07:08:18,700 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 07:08:39,158] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:08:39,394] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:08:40,311] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:08:40,058] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:08:41,012] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:08:40,403] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:08:41,396] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:08:41,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:08:41,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:08:41,163] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:08:42,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:08:41,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:08:41,742] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:08:42,531] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:08:42,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:08:42,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:26,192] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:26,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:28,867] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:29,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:28,491] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:28,953] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:29,836] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:30,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:30,789] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:30,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:30,356] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:31,395] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:31,290] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:32,592] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:32,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:09:33,563] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:14,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:15,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:16,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:17,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:18,170] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:17,424] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:18,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:17,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:18,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:19,313] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:18,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:19,221] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:21,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:21,348] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:21,760] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:10:21,963] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:03,067] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:03,534] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:04,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:06,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:07,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:06,472] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:06,722] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:07,265] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:07,541] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:08,452] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:08,582] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:10,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:10,606] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:11,153] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:11,349] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:11:13,101] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 07:12:13,249 (deepspeed_trainer:228) INFO: 33epoch:train:1-100batch: iter_time=2.418, loss_ctc=73.821, loss_att=52.030, acc=0.724, loss=58.594, grad_norm=5.194, loss_scale=1.000, learning_rate=7.071e-05, step_time=0.357 [cnode7-012:0/16] 2024-12-08 07:12:46,581 (deepspeed_trainer:228) INFO: 33epoch:train:101-200batch: iter_time=1.100e-04, loss_ctc=66.624, loss_att=49.774, acc=0.731, loss=54.825, grad_norm=4.960, loss_scale=1.000, learning_rate=7.070e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 07:13:19,728 (deepspeed_trainer:228) INFO: 33epoch:train:201-300batch: iter_time=1.073e-04, loss_ctc=65.999, loss_att=53.385, acc=0.722, loss=57.147, grad_norm=5.283, loss_scale=1.000, learning_rate=7.069e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 07:13:52,889 (deepspeed_trainer:228) INFO: 33epoch:train:301-400batch: iter_time=1.064e-04, loss_ctc=71.093, loss_att=50.078, acc=0.721, loss=56.423, grad_norm=6.797, loss_scale=1.000, learning_rate=7.068e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 07:14:26,524 (deepspeed_trainer:228) INFO: 33epoch:train:401-500batch: iter_time=1.118e-04, loss_ctc=75.666, loss_att=55.480, acc=0.711, loss=61.556, grad_norm=5.559, loss_scale=1.000, learning_rate=7.068e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 07:15:00,326 (deepspeed_trainer:228) INFO: 33epoch:train:501-600batch: iter_time=1.157e-04, loss_ctc=76.484, loss_att=60.811, acc=0.713, loss=65.520, grad_norm=6.517, loss_scale=1.000, learning_rate=7.067e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 07:15:33,752 (deepspeed_trainer:228) INFO: 33epoch:train:601-700batch: iter_time=1.191e-04, loss_ctc=74.421, loss_att=51.812, acc=0.730, loss=58.591, grad_norm=6.269, loss_scale=1.000, learning_rate=7.066e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 07:16:07,120 (deepspeed_trainer:228) INFO: 33epoch:train:701-800batch: iter_time=1.147e-04, loss_ctc=70.855, loss_att=52.097, acc=0.724, loss=57.708, grad_norm=5.632, loss_scale=1.000, learning_rate=7.066e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 07:16:41,414 (deepspeed_trainer:228) INFO: 33epoch:train:801-900batch: iter_time=1.152e-04, loss_ctc=86.520, loss_att=66.372, acc=0.700, loss=72.420, grad_norm=6.879, loss_scale=1.000, learning_rate=7.065e-05, step_time=0.342 [2024-12-08 07:17:14,902] [INFO] [logging.py:129:log_dist] [Rank 0] step=481000, skipped=0, lr=[np.float64(7.06370626245699e-05)], mom=[[0.9, 0.98]] [2024-12-08 07:17:14,902] [INFO] [timer.py:264:stop] epoch=0/micro_step=91000/global_step=91000, RunningAvgSamplesPerSec=49.26281712025757, CurrSamplesPerSec=49.1042075208472, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 07:17:14,905 (deepspeed_trainer:228) INFO: 33epoch:train:901-1000batch: iter_time=1.198e-04, loss_ctc=80.509, loss_att=63.942, acc=0.710, loss=68.907, grad_norm=5.581, loss_scale=1.000, learning_rate=7.064e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 07:17:48,078 (deepspeed_trainer:228) INFO: 33epoch:train:1001-1100batch: iter_time=1.127e-04, loss_ctc=67.510, loss_att=49.761, acc=0.732, loss=55.061, grad_norm=4.890, loss_scale=1.000, learning_rate=7.063e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 07:18:21,367 (deepspeed_trainer:228) INFO: 33epoch:train:1101-1200batch: iter_time=1.249e-04, loss_ctc=75.025, loss_att=54.438, acc=0.726, loss=60.614, grad_norm=5.975, loss_scale=1.000, learning_rate=7.063e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 07:18:55,072 (deepspeed_trainer:228) INFO: 33epoch:train:1201-1300batch: iter_time=1.193e-04, loss_ctc=73.955, loss_att=56.377, acc=0.723, loss=61.648, grad_norm=5.351, loss_scale=1.000, learning_rate=7.062e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 07:19:28,032 (deepspeed_trainer:228) INFO: 33epoch:train:1301-1400batch: iter_time=1.098e-04, loss_ctc=67.856, loss_att=51.851, acc=0.730, loss=56.646, grad_norm=5.636, loss_scale=1.000, learning_rate=7.061e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 07:20:01,222 (deepspeed_trainer:228) INFO: 33epoch:train:1401-1500batch: iter_time=1.110e-04, loss_ctc=72.466, loss_att=51.590, acc=0.730, loss=57.852, grad_norm=5.638, loss_scale=1.000, learning_rate=7.060e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 07:20:34,427 (deepspeed_trainer:228) INFO: 33epoch:train:1501-1600batch: iter_time=1.133e-04, loss_ctc=76.257, loss_att=57.534, acc=0.730, loss=63.166, grad_norm=6.382, loss_scale=1.000, learning_rate=7.060e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 07:21:07,686 (deepspeed_trainer:228) INFO: 33epoch:train:1601-1700batch: iter_time=1.182e-04, loss_ctc=67.307, loss_att=47.328, acc=0.735, loss=53.336, grad_norm=4.971, loss_scale=1.000, learning_rate=7.059e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 07:21:41,107 (deepspeed_trainer:228) INFO: 33epoch:train:1701-1800batch: iter_time=1.188e-04, loss_ctc=73.057, loss_att=55.718, acc=0.721, loss=60.894, grad_norm=5.621, loss_scale=1.000, learning_rate=7.058e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 07:22:11,312 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-08 07:22:38,304 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 07:22:56,045 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 07:22:56,045 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 07:22:56,047 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 07:23:20,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:23:20,494] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:23:20,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:23:21,074] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:23:22,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:23:23,645] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:23:24,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:23:25,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:23:26,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:23:26,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:23:26,270] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:23:26,332] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:23:26,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:23:26,449] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:23:26,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:23:26,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:10,800] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:11,338] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:11,446] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:12,512] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:13,565] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:13,841] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:14,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:16,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:18,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:18,866] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:19,498] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:19,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:20,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:20,275] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:20,376] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:20,410] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:59,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:24:59,948] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:00,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:01,205] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:01,572] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:04,736] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:05,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:05,689] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:07,568] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:07,855] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:08,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:09,044] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:09,476] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:09,800] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:09,834] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:10,734] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:49,411] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:49,492] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:50,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:51,511] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:55,851] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:54,984] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:55,451] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:56,833] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:57,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:57,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:58,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:58,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:57,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:58,783] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:25:59,064] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:26:01,143] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 07:26:36,589 (deepspeed_trainer:228) INFO: 33epoch:train:1801-1900batch: iter_time=2.601, loss_ctc=73.091, loss_att=50.561, acc=0.736, loss=57.318, grad_norm=5.262, loss_scale=1.000, learning_rate=7.057e-05, step_time=0.353 [2024-12-08 07:27:09,518] [INFO] [logging.py:129:log_dist] [Rank 0] step=482000, skipped=0, lr=[np.float64(7.056374977036e-05)], mom=[[0.9, 0.98]] [2024-12-08 07:27:09,518] [INFO] [timer.py:264:stop] epoch=0/micro_step=92000/global_step=92000, RunningAvgSamplesPerSec=49.26393265496847, CurrSamplesPerSec=52.53990571567069, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 07:27:09,521 (deepspeed_trainer:228) INFO: 33epoch:train:1901-2000batch: iter_time=1.143e-04, loss_ctc=72.291, loss_att=53.056, acc=0.716, loss=58.830, grad_norm=5.611, loss_scale=1.000, learning_rate=7.057e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 07:27:42,639 (deepspeed_trainer:228) INFO: 33epoch:train:2001-2100batch: iter_time=1.108e-04, loss_ctc=64.027, loss_att=48.615, acc=0.730, loss=53.274, grad_norm=5.038, loss_scale=1.000, learning_rate=7.056e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 07:28:15,665 (deepspeed_trainer:228) INFO: 33epoch:train:2101-2200batch: iter_time=1.137e-04, loss_ctc=70.808, loss_att=59.947, acc=0.710, loss=63.218, grad_norm=7.312, loss_scale=1.000, learning_rate=7.055e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 07:28:48,818 (deepspeed_trainer:228) INFO: 33epoch:train:2201-2300batch: iter_time=1.142e-04, loss_ctc=71.533, loss_att=50.667, acc=0.723, loss=56.926, grad_norm=5.657, loss_scale=1.000, learning_rate=7.055e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 07:29:21,909 (deepspeed_trainer:228) INFO: 33epoch:train:2301-2400batch: iter_time=1.133e-04, loss_ctc=75.269, loss_att=55.685, acc=0.713, loss=61.590, grad_norm=5.609, loss_scale=1.000, learning_rate=7.054e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 07:29:55,198 (deepspeed_trainer:228) INFO: 33epoch:train:2401-2500batch: iter_time=1.132e-04, loss_ctc=73.147, loss_att=57.999, acc=0.716, loss=62.546, grad_norm=6.242, loss_scale=1.000, learning_rate=7.053e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 07:30:28,491 (deepspeed_trainer:228) INFO: 33epoch:train:2501-2600batch: iter_time=1.091e-04, loss_ctc=72.343, loss_att=50.446, acc=0.725, loss=57.011, grad_norm=5.798, loss_scale=1.000, learning_rate=7.052e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 07:31:01,577 (deepspeed_trainer:228) INFO: 33epoch:train:2601-2700batch: iter_time=1.072e-04, loss_ctc=69.253, loss_att=52.381, acc=0.717, loss=57.444, grad_norm=5.346, loss_scale=1.000, learning_rate=7.052e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 07:31:34,950 (deepspeed_trainer:228) INFO: 33epoch:train:2701-2800batch: iter_time=1.097e-04, loss_ctc=88.509, loss_att=69.238, acc=0.694, loss=75.001, grad_norm=7.916, loss_scale=1.000, learning_rate=7.051e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 07:32:08,116 (deepspeed_trainer:228) INFO: 33epoch:train:2801-2900batch: iter_time=1.106e-04, loss_ctc=78.937, loss_att=56.615, acc=0.727, loss=63.328, grad_norm=5.322, loss_scale=1.000, learning_rate=7.050e-05, step_time=0.331 [2024-12-08 07:32:40,911] [INFO] [logging.py:129:log_dist] [Rank 0] step=483000, skipped=0, lr=[np.float64(7.04906647133593e-05)], mom=[[0.9, 0.98]] [2024-12-08 07:32:40,912] [INFO] [timer.py:264:stop] epoch=0/micro_step=93000/global_step=93000, RunningAvgSamplesPerSec=49.26924943134224, CurrSamplesPerSec=52.535833786400225, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 07:32:40,914 (deepspeed_trainer:228) INFO: 33epoch:train:2901-3000batch: iter_time=1.094e-04, loss_ctc=65.749, loss_att=53.078, acc=0.706, loss=56.862, grad_norm=4.989, loss_scale=1.000, learning_rate=7.049e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 07:33:13,943 (deepspeed_trainer:228) INFO: 33epoch:train:3001-3100batch: iter_time=1.083e-04, loss_ctc=76.561, loss_att=53.029, acc=0.727, loss=60.096, grad_norm=5.297, loss_scale=1.000, learning_rate=7.049e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 07:33:46,869 (deepspeed_trainer:228) INFO: 33epoch:train:3101-3200batch: iter_time=1.090e-04, loss_ctc=73.259, loss_att=54.474, acc=0.724, loss=60.112, grad_norm=5.146, loss_scale=1.000, learning_rate=7.048e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 07:34:19,701 (deepspeed_trainer:228) INFO: 33epoch:train:3201-3300batch: iter_time=1.075e-04, loss_ctc=65.859, loss_att=50.639, acc=0.730, loss=55.208, grad_norm=4.684, loss_scale=1.000, learning_rate=7.047e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 07:34:52,397 (deepspeed_trainer:228) INFO: 33epoch:train:3301-3400batch: iter_time=1.083e-04, loss_ctc=71.339, loss_att=48.822, acc=0.731, loss=55.572, grad_norm=5.118, loss_scale=1.000, learning_rate=7.047e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 07:35:25,166 (deepspeed_trainer:228) INFO: 33epoch:train:3401-3500batch: iter_time=1.065e-04, loss_ctc=72.463, loss_att=55.008, acc=0.730, loss=60.256, grad_norm=6.176, loss_scale=1.000, learning_rate=7.046e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 07:35:58,105 (deepspeed_trainer:228) INFO: 33epoch:train:3501-3600batch: iter_time=1.086e-04, loss_ctc=71.399, loss_att=49.530, acc=0.727, loss=56.075, grad_norm=5.652, loss_scale=1.000, learning_rate=7.045e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 07:36:30,955 (deepspeed_trainer:228) INFO: 33epoch:train:3601-3700batch: iter_time=1.082e-04, loss_ctc=74.323, loss_att=57.343, acc=0.718, loss=62.445, grad_norm=6.082, loss_scale=1.000, learning_rate=7.044e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 07:36:51,325 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-08 07:37:17,468 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 07:37:33,230 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 07:37:33,230 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 07:37:33,232 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 07:37:59,271] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:00,391] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:00,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:01,556] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:01,958] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:02,413] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:02,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:01,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:01,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:02,812] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:01,947] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:03,237] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:02,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:02,574] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:03,811] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:05,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:48,664] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:48,149] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:49,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:50,016] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:51,304] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:50,440] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:51,756] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:51,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:52,133] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:51,771] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:52,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:52,877] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:52,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:53,462] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:38:55,260] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:01,769] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:35,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:37,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:37,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:39,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:39,581] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:38,665] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:41,086] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:40,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:41,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:40,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:41,476] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:41,717] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:41,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:45,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:45,580] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:39:58,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:23,928] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:26,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:26,185] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:27,476] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:26,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:27,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:28,458] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:29,309] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:29,657] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:31,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:30,561] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:31,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:31,365] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:33,946] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:35,379] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:40:49,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 07:41:29,740 (deepspeed_trainer:228) INFO: 33epoch:train:3701-3800batch: iter_time=2.487, loss_ctc=67.638, loss_att=48.992, acc=0.721, loss=54.588, grad_norm=5.531, loss_scale=1.000, learning_rate=7.044e-05, step_time=0.501 [cnode7-012:0/16] 2024-12-08 07:42:03,567 (deepspeed_trainer:228) INFO: 33epoch:train:3801-3900batch: iter_time=1.123e-04, loss_ctc=73.535, loss_att=52.254, acc=0.733, loss=58.642, grad_norm=5.645, loss_scale=1.000, learning_rate=7.043e-05, step_time=0.338 [2024-12-08 07:42:37,388] [INFO] [logging.py:129:log_dist] [Rank 0] step=484000, skipped=0, lr=[np.float64(7.041780627632297e-05)], mom=[[0.9, 0.98]] [2024-12-08 07:42:37,389] [INFO] [timer.py:264:stop] epoch=0/micro_step=94000/global_step=94000, RunningAvgSamplesPerSec=49.24793751210867, CurrSamplesPerSec=51.75620409734939, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 07:42:37,392 (deepspeed_trainer:228) INFO: 33epoch:train:3901-4000batch: iter_time=1.081e-04, loss_ctc=66.137, loss_att=54.041, acc=0.723, loss=57.655, grad_norm=5.505, loss_scale=1.000, learning_rate=7.042e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 07:43:11,009 (deepspeed_trainer:228) INFO: 33epoch:train:4001-4100batch: iter_time=1.100e-04, loss_ctc=64.289, loss_att=50.333, acc=0.728, loss=54.506, grad_norm=5.843, loss_scale=1.000, learning_rate=7.041e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 07:43:45,091 (deepspeed_trainer:228) INFO: 33epoch:train:4101-4200batch: iter_time=1.087e-04, loss_ctc=76.936, loss_att=55.880, acc=0.715, loss=62.202, grad_norm=5.962, loss_scale=1.000, learning_rate=7.041e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 07:44:18,737 (deepspeed_trainer:228) INFO: 33epoch:train:4201-4300batch: iter_time=1.098e-04, loss_ctc=72.749, loss_att=59.188, acc=0.722, loss=63.250, grad_norm=5.382, loss_scale=1.000, learning_rate=7.040e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 07:44:52,264 (deepspeed_trainer:228) INFO: 33epoch:train:4301-4400batch: iter_time=1.094e-04, loss_ctc=73.329, loss_att=53.224, acc=0.724, loss=59.242, grad_norm=5.906, loss_scale=1.000, learning_rate=7.039e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 07:45:25,939 (deepspeed_trainer:228) INFO: 33epoch:train:4401-4500batch: iter_time=1.100e-04, loss_ctc=70.739, loss_att=52.067, acc=0.726, loss=57.681, grad_norm=6.179, loss_scale=1.000, learning_rate=7.039e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 07:45:59,674 (deepspeed_trainer:228) INFO: 33epoch:train:4501-4600batch: iter_time=1.085e-04, loss_ctc=72.662, loss_att=55.829, acc=0.724, loss=60.910, grad_norm=5.266, loss_scale=1.000, learning_rate=7.038e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 07:46:33,395 (deepspeed_trainer:228) INFO: 33epoch:train:4601-4700batch: iter_time=1.069e-04, loss_ctc=83.229, loss_att=64.932, acc=0.699, loss=70.435, grad_norm=7.153, loss_scale=1.000, learning_rate=7.037e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 07:47:07,086 (deepspeed_trainer:228) INFO: 33epoch:train:4701-4800batch: iter_time=1.107e-04, loss_ctc=80.251, loss_att=57.766, acc=0.733, loss=64.499, grad_norm=5.211, loss_scale=1.000, learning_rate=7.036e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 07:47:40,259 (deepspeed_trainer:228) INFO: 33epoch:train:4801-4900batch: iter_time=1.097e-04, loss_ctc=65.843, loss_att=50.599, acc=0.719, loss=55.176, grad_norm=5.491, loss_scale=1.000, learning_rate=7.036e-05, step_time=0.331 [2024-12-08 07:48:14,114] [INFO] [logging.py:129:log_dist] [Rank 0] step=485000, skipped=0, lr=[np.float64(7.034517329050608e-05)], mom=[[0.9, 0.98]] [2024-12-08 07:48:14,115] [INFO] [timer.py:264:stop] epoch=0/micro_step=95000/global_step=95000, RunningAvgSamplesPerSec=49.24465829230863, CurrSamplesPerSec=50.73396091671086, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 07:48:14,118 (deepspeed_trainer:228) INFO: 33epoch:train:4901-5000batch: iter_time=1.075e-04, loss_ctc=77.854, loss_att=58.525, acc=0.734, loss=64.339, grad_norm=5.353, loss_scale=1.000, learning_rate=7.035e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 07:48:48,354 (deepspeed_trainer:228) INFO: 33epoch:train:5001-5100batch: iter_time=1.092e-04, loss_ctc=67.343, loss_att=50.781, acc=0.726, loss=55.734, grad_norm=5.017, loss_scale=1.000, learning_rate=7.034e-05, step_time=0.342 [cnode7-012:0/16] 2024-12-08 07:49:21,908 (deepspeed_trainer:228) INFO: 33epoch:train:5101-5200batch: iter_time=1.083e-04, loss_ctc=67.779, loss_att=50.139, acc=0.739, loss=55.441, grad_norm=5.779, loss_scale=1.000, learning_rate=7.033e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 07:49:55,192 (deepspeed_trainer:228) INFO: 33epoch:train:5201-5300batch: iter_time=1.112e-04, loss_ctc=70.561, loss_att=50.262, acc=0.736, loss=56.354, grad_norm=5.176, loss_scale=1.000, learning_rate=7.033e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 07:50:28,390 (deepspeed_trainer:228) INFO: 33epoch:train:5301-5400batch: iter_time=1.068e-04, loss_ctc=70.806, loss_att=52.507, acc=0.737, loss=58.013, grad_norm=6.018, loss_scale=1.000, learning_rate=7.032e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 07:51:01,689 (deepspeed_trainer:228) INFO: 33epoch:train:5401-5500batch: iter_time=1.096e-04, loss_ctc=72.978, loss_att=52.422, acc=0.731, loss=58.588, grad_norm=5.700, loss_scale=1.000, learning_rate=7.031e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 07:51:34,997 (deepspeed_trainer:228) INFO: 33epoch:train:5501-5600batch: iter_time=1.077e-04, loss_ctc=72.774, loss_att=53.382, acc=0.730, loss=59.199, grad_norm=5.857, loss_scale=1.000, learning_rate=7.031e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 07:51:46,775 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-08 07:52:13,373 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 07:52:29,076 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 07:52:29,076 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 07:52:29,078 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 07:52:54,311] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:52:54,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:52:56,043] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:52:57,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:52:57,635] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:52:56,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:52:57,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:52:57,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:52:59,006] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:52:57,908] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:52:59,278] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:52:59,493] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:52:59,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:00,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:01,100] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:00,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:43,089] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:46,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:45,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:47,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:46,481] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:46,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:47,143] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:48,299] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:47,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:48,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:47,851] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:48,979] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:49,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:48,553] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:49,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:53:51,843] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:30,571] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:35,410] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:35,736] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:35,123] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:36,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:35,397] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:35,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:37,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:37,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:36,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:37,367] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:38,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:38,846] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:38,713] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:39,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:54:41,970] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:17,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:23,320] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:24,452] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:25,250] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:24,379] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:25,706] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:24,731] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:26,927] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:26,127] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:27,551] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:26,799] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:27,113] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:28,302] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:28,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:29,308] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 07:55:31,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 07:56:22,213 (deepspeed_trainer:228) INFO: 33epoch:train:5601-5700batch: iter_time=2.514, loss_ctc=69.353, loss_att=50.215, acc=0.726, loss=55.952, grad_norm=5.447, loss_scale=1.000, learning_rate=7.030e-05, step_time=0.358 [cnode7-012:0/16] 2024-12-08 07:56:55,381 (deepspeed_trainer:228) INFO: 33epoch:train:5701-5800batch: iter_time=1.104e-04, loss_ctc=73.297, loss_att=55.433, acc=0.722, loss=60.756, grad_norm=6.150, loss_scale=1.000, learning_rate=7.029e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 07:57:28,628 (deepspeed_trainer:228) INFO: 33epoch:train:5801-5900batch: iter_time=1.190e-04, loss_ctc=64.013, loss_att=49.935, acc=0.731, loss=54.156, grad_norm=5.098, loss_scale=1.000, learning_rate=7.028e-05, step_time=0.332 [2024-12-08 07:58:01,575] [INFO] [logging.py:129:log_dist] [Rank 0] step=486000, skipped=0, lr=[np.float64(7.027276459558493e-05)], mom=[[0.9, 0.98]] [2024-12-08 07:58:01,576] [INFO] [timer.py:264:stop] epoch=0/micro_step=96000/global_step=96000, RunningAvgSamplesPerSec=49.24329580692828, CurrSamplesPerSec=51.38349927986879, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 07:58:01,579 (deepspeed_trainer:228) INFO: 33epoch:train:5901-6000batch: iter_time=1.085e-04, loss_ctc=65.651, loss_att=48.789, acc=0.721, loss=53.864, grad_norm=6.258, loss_scale=1.000, learning_rate=7.028e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 07:58:34,844 (deepspeed_trainer:228) INFO: 33epoch:train:6001-6100batch: iter_time=1.084e-04, loss_ctc=73.351, loss_att=54.269, acc=0.709, loss=60.006, grad_norm=6.208, loss_scale=1.000, learning_rate=7.027e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 07:59:08,257 (deepspeed_trainer:228) INFO: 33epoch:train:6101-6200batch: iter_time=1.115e-04, loss_ctc=76.794, loss_att=61.807, acc=0.716, loss=66.301, grad_norm=7.193, loss_scale=1.000, learning_rate=7.026e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 07:59:41,408 (deepspeed_trainer:228) INFO: 33epoch:train:6201-6300batch: iter_time=1.086e-04, loss_ctc=71.151, loss_att=52.822, acc=0.722, loss=58.347, grad_norm=5.678, loss_scale=1.000, learning_rate=7.025e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 08:00:14,306 (deepspeed_trainer:228) INFO: 33epoch:train:6301-6400batch: iter_time=1.108e-04, loss_ctc=65.316, loss_att=46.680, acc=0.737, loss=52.268, grad_norm=5.316, loss_scale=1.000, learning_rate=7.025e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 08:00:47,480 (deepspeed_trainer:228) INFO: 33epoch:train:6401-6500batch: iter_time=1.099e-04, loss_ctc=78.512, loss_att=62.211, acc=0.704, loss=67.098, grad_norm=7.387, loss_scale=1.000, learning_rate=7.024e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 08:01:20,764 (deepspeed_trainer:228) INFO: 33epoch:train:6501-6600batch: iter_time=1.075e-04, loss_ctc=85.484, loss_att=65.939, acc=0.703, loss=71.783, grad_norm=6.590, loss_scale=1.000, learning_rate=7.023e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 08:01:53,911 (deepspeed_trainer:228) INFO: 33epoch:train:6601-6700batch: iter_time=1.113e-04, loss_ctc=69.448, loss_att=49.191, acc=0.733, loss=55.280, grad_norm=5.098, loss_scale=1.000, learning_rate=7.023e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 08:02:26,964 (deepspeed_trainer:228) INFO: 33epoch:train:6701-6800batch: iter_time=1.098e-04, loss_ctc=68.449, loss_att=52.532, acc=0.718, loss=57.290, grad_norm=5.494, loss_scale=1.000, learning_rate=7.022e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 08:03:00,469 (deepspeed_trainer:228) INFO: 33epoch:train:6801-6900batch: iter_time=1.100e-04, loss_ctc=83.329, loss_att=63.850, acc=0.718, loss=69.683, grad_norm=6.023, loss_scale=1.000, learning_rate=7.021e-05, step_time=0.335 [2024-12-08 08:03:33,570] [INFO] [logging.py:129:log_dist] [Rank 0] step=487000, skipped=0, lr=[np.float64(7.020057903957917e-05)], mom=[[0.9, 0.98]] [2024-12-08 08:03:33,571] [INFO] [timer.py:264:stop] epoch=0/micro_step=97000/global_step=97000, RunningAvgSamplesPerSec=49.24780349920602, CurrSamplesPerSec=51.84548986458906, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 08:03:33,573 (deepspeed_trainer:228) INFO: 33epoch:train:6901-7000batch: iter_time=1.195e-04, loss_ctc=63.828, loss_att=47.131, acc=0.728, loss=52.150, grad_norm=5.132, loss_scale=1.000, learning_rate=7.020e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 08:04:06,247 (deepspeed_trainer:228) INFO: 33epoch:train:7001-7100batch: iter_time=1.100e-04, loss_ctc=65.978, loss_att=44.727, acc=0.734, loss=51.117, grad_norm=5.456, loss_scale=1.000, learning_rate=7.020e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 08:04:38,256 (deepspeed_trainer:228) INFO: 33epoch:train:7101-7200batch: iter_time=1.112e-04, loss_ctc=72.078, loss_att=50.931, acc=0.736, loss=57.258, grad_norm=5.996, loss_scale=1.000, learning_rate=7.019e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 08:05:11,039 (deepspeed_trainer:228) INFO: 33epoch:train:7201-7300batch: iter_time=1.088e-04, loss_ctc=72.820, loss_att=55.365, acc=0.723, loss=60.604, grad_norm=6.601, loss_scale=1.000, learning_rate=7.018e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 08:05:43,899 (deepspeed_trainer:228) INFO: 33epoch:train:7301-7400batch: iter_time=1.111e-04, loss_ctc=73.596, loss_att=54.183, acc=0.724, loss=60.002, grad_norm=5.040, loss_scale=1.000, learning_rate=7.018e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 08:06:16,579 (deepspeed_trainer:228) INFO: 33epoch:train:7401-7500batch: iter_time=1.081e-04, loss_ctc=72.751, loss_att=51.306, acc=0.732, loss=57.769, grad_norm=5.245, loss_scale=1.000, learning_rate=7.017e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 08:06:20,381 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-08 08:06:47,715 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 08:07:04,817 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 08:07:04,818 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 08:07:04,820 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 08:07:29,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:07:29,330] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:07:29,779] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:07:30,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:07:31,034] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:07:31,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:07:32,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:07:32,109] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:07:32,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:07:32,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:07:32,427] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:07:32,838] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:07:33,192] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:07:32,877] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:07:33,663] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:07:34,498] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:18,132] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:18,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:18,551] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:19,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:20,396] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:21,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:20,843] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:21,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:21,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:21,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:22,198] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:21,902] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:22,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:22,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:22,847] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:08:23,444] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:05,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:06,223] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:06,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:09,033] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:09,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:10,507] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:10,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:11,200] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:11,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:11,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:11,847] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:12,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:12,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:12,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:12,371] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:12,910] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:54,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:55,740] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:56,128] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:58,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:58,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:09:59,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:10:00,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:10:00,799] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:10:01,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:10:01,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:10:01,493] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:10:01,857] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:10:02,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:10:02,331] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:10:03,111] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:10:03,216] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 08:11:04,544 (deepspeed_trainer:228) INFO: 33epoch:train:7501-7600batch: iter_time=2.531, loss_ctc=72.519, loss_att=50.792, acc=0.726, loss=57.296, grad_norm=5.886, loss_scale=1.000, learning_rate=7.016e-05, step_time=0.348 [cnode7-012:0/16] 2024-12-08 08:11:38,019 (deepspeed_trainer:228) INFO: 33epoch:train:7601-7700batch: iter_time=1.086e-04, loss_ctc=64.698, loss_att=48.321, acc=0.731, loss=53.263, grad_norm=5.112, loss_scale=1.000, learning_rate=7.015e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 08:12:11,929 (deepspeed_trainer:228) INFO: 33epoch:train:7701-7800batch: iter_time=1.089e-04, loss_ctc=65.064, loss_att=52.213, acc=0.722, loss=56.072, grad_norm=5.277, loss_scale=1.000, learning_rate=7.015e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 08:12:45,709 (deepspeed_trainer:228) INFO: 33epoch:train:7801-7900batch: iter_time=1.067e-04, loss_ctc=69.219, loss_att=51.131, acc=0.721, loss=56.586, grad_norm=7.371, loss_scale=1.000, learning_rate=7.014e-05, step_time=0.337 [2024-12-08 08:13:19,340] [INFO] [logging.py:129:log_dist] [Rank 0] step=488000, skipped=0, lr=[np.float64(7.012861547877486e-05)], mom=[[0.9, 0.98]] [2024-12-08 08:13:19,341] [INFO] [timer.py:264:stop] epoch=0/micro_step=98000/global_step=98000, RunningAvgSamplesPerSec=49.25089763095224, CurrSamplesPerSec=48.3632912698979, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 08:13:19,344 (deepspeed_trainer:228) INFO: 33epoch:train:7901-8000batch: iter_time=1.096e-04, loss_ctc=74.141, loss_att=53.957, acc=0.711, loss=60.042, grad_norm=5.723, loss_scale=1.000, learning_rate=7.013e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 08:13:52,809 (deepspeed_trainer:228) INFO: 33epoch:train:8001-8100batch: iter_time=1.089e-04, loss_ctc=71.776, loss_att=61.003, acc=0.715, loss=64.218, grad_norm=6.377, loss_scale=1.000, learning_rate=7.012e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 08:14:26,004 (deepspeed_trainer:228) INFO: 33epoch:train:8101-8200batch: iter_time=1.123e-04, loss_ctc=72.562, loss_att=51.077, acc=0.732, loss=57.523, grad_norm=6.118, loss_scale=1.000, learning_rate=7.012e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 08:14:59,334 (deepspeed_trainer:228) INFO: 33epoch:train:8201-8300batch: iter_time=1.093e-04, loss_ctc=69.276, loss_att=49.996, acc=0.729, loss=55.777, grad_norm=5.801, loss_scale=1.000, learning_rate=7.011e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 08:15:33,100 (deepspeed_trainer:228) INFO: 33epoch:train:8301-8400batch: iter_time=1.189e-04, loss_ctc=82.565, loss_att=65.227, acc=0.698, loss=70.437, grad_norm=7.877, loss_scale=1.000, learning_rate=7.010e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 08:16:06,865 (deepspeed_trainer:228) INFO: 33epoch:train:8401-8500batch: iter_time=1.088e-04, loss_ctc=79.139, loss_att=60.388, acc=0.712, loss=66.052, grad_norm=6.273, loss_scale=1.000, learning_rate=7.010e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 08:16:40,256 (deepspeed_trainer:228) INFO: 33epoch:train:8501-8600batch: iter_time=1.086e-04, loss_ctc=66.741, loss_att=49.383, acc=0.729, loss=54.566, grad_norm=5.383, loss_scale=1.000, learning_rate=7.009e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 08:17:13,399 (deepspeed_trainer:228) INFO: 33epoch:train:8601-8700batch: iter_time=1.094e-04, loss_ctc=73.537, loss_att=53.165, acc=0.728, loss=59.253, grad_norm=5.581, loss_scale=1.000, learning_rate=7.008e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 08:17:46,743 (deepspeed_trainer:228) INFO: 33epoch:train:8701-8800batch: iter_time=1.106e-04, loss_ctc=72.578, loss_att=55.019, acc=0.718, loss=60.293, grad_norm=5.288, loss_scale=1.000, learning_rate=7.007e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 08:18:19,900 (deepspeed_trainer:228) INFO: 33epoch:train:8801-8900batch: iter_time=1.102e-04, loss_ctc=66.558, loss_att=50.752, acc=0.727, loss=55.512, grad_norm=4.902, loss_scale=1.000, learning_rate=7.007e-05, step_time=0.331 [2024-12-08 08:18:52,970] [INFO] [logging.py:129:log_dist] [Rank 0] step=489000, skipped=0, lr=[np.float64(7.005687277764824e-05)], mom=[[0.9, 0.98]] [2024-12-08 08:18:52,970] [INFO] [timer.py:264:stop] epoch=0/micro_step=99000/global_step=99000, RunningAvgSamplesPerSec=49.25210403254016, CurrSamplesPerSec=52.377887289826916, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 08:18:52,973 (deepspeed_trainer:228) INFO: 33epoch:train:8901-9000batch: iter_time=1.077e-04, loss_ctc=72.831, loss_att=49.598, acc=0.731, loss=56.574, grad_norm=5.002, loss_scale=1.000, learning_rate=7.006e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 08:19:26,211 (deepspeed_trainer:228) INFO: 33epoch:train:9001-9100batch: iter_time=1.113e-04, loss_ctc=71.958, loss_att=53.747, acc=0.734, loss=59.242, grad_norm=5.985, loss_scale=1.000, learning_rate=7.005e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 08:19:59,400 (deepspeed_trainer:228) INFO: 33epoch:train:9101-9200batch: iter_time=1.114e-04, loss_ctc=66.204, loss_att=46.529, acc=0.733, loss=52.423, grad_norm=5.445, loss_scale=1.000, learning_rate=7.005e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 08:20:32,782 (deepspeed_trainer:228) INFO: 33epoch:train:9201-9300batch: iter_time=1.102e-04, loss_ctc=72.335, loss_att=54.651, acc=0.720, loss=59.994, grad_norm=5.496, loss_scale=1.000, learning_rate=7.004e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 08:21:01,477 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-08 08:21:27,844 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 08:21:44,915 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 08:21:44,916 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 08:21:44,918 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 08:22:11,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:22:11,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:22:12,286] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:22:12,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:22:12,572] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:22:13,028] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:22:13,476] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:22:13,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:22:13,331] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:22:14,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:22:14,304] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:22:14,720] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:22:15,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:22:14,944] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:22:15,488] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:22:15,260] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:00,032] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:01,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:01,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:01,397] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:01,567] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:01,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:02,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:03,496] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:03,410] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:03,860] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:03,734] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:04,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:04,423] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:04,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:04,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:05,584] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:49,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:51,048] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:51,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:51,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:52,568] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:53,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:53,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:53,027] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:53,435] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:54,131] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:54,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:54,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:55,079] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:55,973] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:56,402] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:23:57,824] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:37,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:39,594] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:39,925] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:40,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:41,311] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:41,121] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:42,065] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:42,566] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:42,605] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:42,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:43,142] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:43,362] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:43,838] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:45,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:47,195] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:24:48,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 08:25:23,738 (deepspeed_trainer:228) INFO: 33epoch:train:9301-9400batch: iter_time=2.515, loss_ctc=72.554, loss_att=49.554, acc=0.736, loss=56.448, grad_norm=5.534, loss_scale=1.000, learning_rate=7.003e-05, step_time=0.394 [cnode7-012:0/16] 2024-12-08 08:25:57,090 (deepspeed_trainer:228) INFO: 33epoch:train:9401-9500batch: iter_time=1.123e-04, loss_ctc=71.769, loss_att=53.601, acc=0.721, loss=59.060, grad_norm=5.731, loss_scale=1.000, learning_rate=7.002e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 08:26:30,275 (deepspeed_trainer:228) INFO: 33epoch:train:9501-9600batch: iter_time=1.147e-04, loss_ctc=63.713, loss_att=48.262, acc=0.736, loss=52.899, grad_norm=5.246, loss_scale=1.000, learning_rate=7.002e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 08:27:03,926 (deepspeed_trainer:228) INFO: 33epoch:train:9601-9700batch: iter_time=1.115e-04, loss_ctc=69.204, loss_att=60.573, acc=0.717, loss=63.182, grad_norm=5.862, loss_scale=1.000, learning_rate=7.001e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 08:27:37,541 (deepspeed_trainer:228) INFO: 33epoch:train:9701-9800batch: iter_time=1.115e-04, loss_ctc=70.869, loss_att=50.313, acc=0.731, loss=56.467, grad_norm=6.140, loss_scale=1.000, learning_rate=7.000e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 08:28:10,826 (deepspeed_trainer:228) INFO: 33epoch:train:9801-9900batch: iter_time=1.133e-04, loss_ctc=74.328, loss_att=56.592, acc=0.721, loss=61.905, grad_norm=5.671, loss_scale=1.000, learning_rate=7.000e-05, step_time=0.332 [2024-12-08 08:28:43,946] [INFO] [logging.py:129:log_dist] [Rank 0] step=490000, skipped=0, lr=[np.float64(6.99853498087907e-05)], mom=[[0.9, 0.98]] [2024-12-08 08:28:43,946] [INFO] [timer.py:264:stop] epoch=0/micro_step=100000/global_step=100000, RunningAvgSamplesPerSec=49.244928298724226, CurrSamplesPerSec=49.444070288859216, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 08:28:43,949 (deepspeed_trainer:228) INFO: 33epoch:train:9901-10000batch: iter_time=1.139e-04, loss_ctc=69.672, loss_att=59.206, acc=0.722, loss=62.349, grad_norm=5.997, loss_scale=1.000, learning_rate=6.999e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 08:29:17,718 (deepspeed_trainer:228) INFO: 33epoch:train:10001-10100batch: iter_time=1.112e-04, loss_ctc=70.786, loss_att=50.473, acc=0.726, loss=56.601, grad_norm=5.799, loss_scale=1.000, learning_rate=6.998e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 08:29:50,699 (deepspeed_trainer:228) INFO: 33epoch:train:10101-10200batch: iter_time=1.114e-04, loss_ctc=68.471, loss_att=51.801, acc=0.727, loss=56.825, grad_norm=5.128, loss_scale=1.000, learning_rate=6.997e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 08:30:23,892 (deepspeed_trainer:228) INFO: 33epoch:train:10201-10300batch: iter_time=1.109e-04, loss_ctc=86.627, loss_att=69.193, acc=0.700, loss=74.443, grad_norm=7.408, loss_scale=1.000, learning_rate=6.997e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 08:30:57,255 (deepspeed_trainer:228) INFO: 33epoch:train:10301-10400batch: iter_time=1.132e-04, loss_ctc=78.181, loss_att=58.038, acc=0.733, loss=64.069, grad_norm=5.057, loss_scale=1.000, learning_rate=6.996e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 08:31:30,069 (deepspeed_trainer:228) INFO: 33epoch:train:10401-10500batch: iter_time=1.113e-04, loss_ctc=65.139, loss_att=52.046, acc=0.719, loss=55.958, grad_norm=5.779, loss_scale=1.000, learning_rate=6.995e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 08:32:03,154 (deepspeed_trainer:228) INFO: 33epoch:train:10501-10600batch: iter_time=1.153e-04, loss_ctc=75.198, loss_att=52.911, acc=0.737, loss=59.580, grad_norm=5.371, loss_scale=1.000, learning_rate=6.995e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 08:32:36,075 (deepspeed_trainer:228) INFO: 33epoch:train:10601-10700batch: iter_time=1.097e-04, loss_ctc=72.239, loss_att=54.103, acc=0.732, loss=59.509, grad_norm=5.476, loss_scale=1.000, learning_rate=6.994e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 08:33:09,031 (deepspeed_trainer:228) INFO: 33epoch:train:10701-10800batch: iter_time=1.108e-04, loss_ctc=65.320, loss_att=50.083, acc=0.738, loss=54.620, grad_norm=4.779, loss_scale=1.000, learning_rate=6.993e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 08:33:42,094 (deepspeed_trainer:228) INFO: 33epoch:train:10801-10900batch: iter_time=1.091e-04, loss_ctc=71.079, loss_att=50.386, acc=0.737, loss=56.577, grad_norm=5.127, loss_scale=1.000, learning_rate=6.992e-05, step_time=0.330 [2024-12-08 08:34:15,042] [INFO] [logging.py:129:log_dist] [Rank 0] step=491000, skipped=0, lr=[np.float64(6.991404545283412e-05)], mom=[[0.9, 0.98]] [2024-12-08 08:34:15,042] [INFO] [timer.py:264:stop] epoch=0/micro_step=101000/global_step=101000, RunningAvgSamplesPerSec=49.25100808036126, CurrSamplesPerSec=46.68753873540388, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 08:34:15,045 (deepspeed_trainer:228) INFO: 33epoch:train:10901-11000batch: iter_time=1.118e-04, loss_ctc=70.980, loss_att=54.035, acc=0.733, loss=59.111, grad_norm=5.647, loss_scale=1.000, learning_rate=6.992e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 08:34:47,989 (deepspeed_trainer:228) INFO: 33epoch:train:11001-11100batch: iter_time=1.081e-04, loss_ctc=70.763, loss_att=50.105, acc=0.734, loss=56.280, grad_norm=5.120, loss_scale=1.000, learning_rate=6.991e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 08:35:20,904 (deepspeed_trainer:228) INFO: 33epoch:train:11101-11200batch: iter_time=1.100e-04, loss_ctc=73.468, loss_att=56.992, acc=0.729, loss=61.927, grad_norm=5.139, loss_scale=1.000, learning_rate=6.990e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 08:35:41,194 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-08 08:36:07,671 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 08:36:23,561 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 08:36:23,561 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 08:36:23,563 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 08:36:50,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:36:51,214] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:36:51,032] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:36:51,785] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:36:51,459] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:36:51,935] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:36:51,682] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:36:52,230] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:36:51,984] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:36:53,159] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:36:53,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:36:53,656] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:36:54,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:36:54,067] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:36:54,110] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:36:54,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:39,402] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:39,682] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:40,354] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:40,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:40,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:40,508] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:41,668] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:41,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:42,392] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:42,452] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:42,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:42,940] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:43,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:43,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:43,380] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:37:44,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:28,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:29,622] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:29,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:29,363] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:29,961] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:29,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:30,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:31,294] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:31,883] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:32,018] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:32,245] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:32,404] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:32,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:32,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:32,580] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:38:35,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:18,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:18,348] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:19,103] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:19,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:19,336] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:19,838] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:19,986] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:20,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:21,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:21,472] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:21,130] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:21,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:22,563] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:22,598] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:22,253] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:39:26,263] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 08:40:09,633 (deepspeed_trainer:228) INFO: 33epoch:train:11201-11300batch: iter_time=2.554, loss_ctc=66.927, loss_att=47.846, acc=0.728, loss=53.573, grad_norm=5.309, loss_scale=1.000, learning_rate=6.990e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 08:40:43,051 (deepspeed_trainer:228) INFO: 33epoch:train:11301-11400batch: iter_time=1.095e-04, loss_ctc=72.588, loss_att=50.911, acc=0.736, loss=57.417, grad_norm=5.374, loss_scale=1.000, learning_rate=6.989e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 08:41:16,215 (deepspeed_trainer:228) INFO: 33epoch:train:11401-11500batch: iter_time=1.138e-04, loss_ctc=65.547, loss_att=53.799, acc=0.724, loss=57.320, grad_norm=4.938, loss_scale=1.000, learning_rate=6.988e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 08:41:49,429 (deepspeed_trainer:228) INFO: 33epoch:train:11501-11600batch: iter_time=1.109e-04, loss_ctc=63.574, loss_att=50.253, acc=0.730, loss=54.251, grad_norm=6.752, loss_scale=1.000, learning_rate=6.987e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 08:42:23,204 (deepspeed_trainer:228) INFO: 33epoch:train:11601-11700batch: iter_time=1.156e-04, loss_ctc=76.199, loss_att=55.348, acc=0.719, loss=61.583, grad_norm=5.784, loss_scale=1.000, learning_rate=6.987e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 08:42:57,043 (deepspeed_trainer:228) INFO: 33epoch:train:11701-11800batch: iter_time=1.122e-04, loss_ctc=71.020, loss_att=57.856, acc=0.727, loss=61.797, grad_norm=5.539, loss_scale=1.000, learning_rate=6.986e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 08:43:30,031 (deepspeed_trainer:228) INFO: 33epoch:train:11801-11900batch: iter_time=1.080e-04, loss_ctc=72.150, loss_att=55.031, acc=0.726, loss=60.175, grad_norm=6.417, loss_scale=1.000, learning_rate=6.985e-05, step_time=0.333 [2024-12-08 08:44:03,485] [INFO] [logging.py:129:log_dist] [Rank 0] step=492000, skipped=0, lr=[np.float64(6.984295859837743e-05)], mom=[[0.9, 0.98]] [2024-12-08 08:44:03,486] [INFO] [timer.py:264:stop] epoch=0/micro_step=102000/global_step=102000, RunningAvgSamplesPerSec=49.25339991720851, CurrSamplesPerSec=51.876028722294336, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 08:44:03,489 (deepspeed_trainer:228) INFO: 33epoch:train:11901-12000batch: iter_time=1.094e-04, loss_ctc=70.044, loss_att=51.942, acc=0.728, loss=57.384, grad_norm=5.137, loss_scale=1.000, learning_rate=6.985e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 08:44:36,938 (deepspeed_trainer:228) INFO: 33epoch:train:12001-12100batch: iter_time=1.104e-04, loss_ctc=72.026, loss_att=55.133, acc=0.729, loss=60.209, grad_norm=5.403, loss_scale=1.000, learning_rate=6.984e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 08:45:10,326 (deepspeed_trainer:228) INFO: 33epoch:train:12101-12200batch: iter_time=1.128e-04, loss_ctc=81.935, loss_att=64.271, acc=0.703, loss=69.546, grad_norm=8.618, loss_scale=1.000, learning_rate=6.983e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 08:45:43,703 (deepspeed_trainer:228) INFO: 33epoch:train:12201-12300batch: iter_time=1.115e-04, loss_ctc=80.286, loss_att=57.634, acc=0.736, loss=64.402, grad_norm=5.602, loss_scale=1.000, learning_rate=6.983e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 08:46:16,604 (deepspeed_trainer:228) INFO: 33epoch:train:12301-12400batch: iter_time=1.110e-04, loss_ctc=65.301, loss_att=50.064, acc=0.721, loss=54.642, grad_norm=5.321, loss_scale=1.000, learning_rate=6.982e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 08:46:50,205 (deepspeed_trainer:228) INFO: 33epoch:train:12401-12500batch: iter_time=1.136e-04, loss_ctc=77.503, loss_att=58.617, acc=0.736, loss=64.287, grad_norm=5.190, loss_scale=1.000, learning_rate=6.981e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 08:47:23,670 (deepspeed_trainer:228) INFO: 33epoch:train:12501-12600batch: iter_time=1.126e-04, loss_ctc=66.783, loss_att=50.371, acc=0.729, loss=55.285, grad_norm=5.106, loss_scale=1.000, learning_rate=6.980e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 08:47:57,221 (deepspeed_trainer:228) INFO: 33epoch:train:12601-12700batch: iter_time=1.132e-04, loss_ctc=67.008, loss_att=49.443, acc=0.742, loss=54.671, grad_norm=4.852, loss_scale=1.000, learning_rate=6.980e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 08:48:30,304 (deepspeed_trainer:228) INFO: 33epoch:train:12701-12800batch: iter_time=1.111e-04, loss_ctc=69.823, loss_att=50.126, acc=0.737, loss=56.057, grad_norm=4.765, loss_scale=1.000, learning_rate=6.979e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 08:49:03,407 (deepspeed_trainer:228) INFO: 33epoch:train:12801-12900batch: iter_time=1.133e-04, loss_ctc=69.806, loss_att=52.676, acc=0.738, loss=57.812, grad_norm=6.052, loss_scale=1.000, learning_rate=6.978e-05, step_time=0.331 [2024-12-08 08:49:36,606] [INFO] [logging.py:129:log_dist] [Rank 0] step=493000, skipped=0, lr=[np.float64(6.977208814191385e-05)], mom=[[0.9, 0.98]] [2024-12-08 08:49:36,606] [INFO] [timer.py:264:stop] epoch=0/micro_step=103000/global_step=103000, RunningAvgSamplesPerSec=49.256328049650605, CurrSamplesPerSec=42.50753490137523, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 08:49:36,609 (deepspeed_trainer:228) INFO: 33epoch:train:12901-13000batch: iter_time=1.108e-04, loss_ctc=72.492, loss_att=52.184, acc=0.733, loss=58.262, grad_norm=5.686, loss_scale=1.000, learning_rate=6.978e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 08:50:09,523 (deepspeed_trainer:228) INFO: 33epoch:train:13001-13100batch: iter_time=1.102e-04, loss_ctc=72.750, loss_att=53.737, acc=0.732, loss=59.422, grad_norm=5.336, loss_scale=1.000, learning_rate=6.977e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 08:50:21,518 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-08 08:50:48,504 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 08:51:06,483 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 08:51:06,483 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 08:51:06,485 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 08:51:32,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:51:32,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:51:32,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:51:32,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:51:33,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:51:33,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:51:33,359] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:51:33,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:51:33,695] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:51:33,909] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:51:34,066] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:51:34,765] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:51:34,629] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:51:34,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:51:35,542] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:51:35,952] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:21,600] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:22,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:22,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:22,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:22,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:23,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:23,143] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:23,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:24,015] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:24,634] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:24,757] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:24,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:25,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:26,047] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:26,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:52:26,538] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:10,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:10,986] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:10,927] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:11,266] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:11,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:11,638] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:12,623] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:12,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:13,756] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:14,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:14,468] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:15,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:15,538] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:17,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:19,152] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:19,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:58,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:59,174] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:59,135] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:59,435] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:53:59,882] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:54:01,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:54:02,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:54:02,513] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:54:03,027] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:54:03,135] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:54:03,936] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:54:04,287] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:54:05,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:54:09,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:54:10,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 08:54:10,852] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 08:55:02,055 (deepspeed_trainer:228) INFO: 33epoch:train:13101-13200batch: iter_time=2.536, loss_ctc=68.932, loss_att=49.905, acc=0.728, loss=55.607, grad_norm=5.442, loss_scale=1.000, learning_rate=6.976e-05, step_time=0.389 [cnode7-012:0/16] 2024-12-08 08:55:35,041 (deepspeed_trainer:228) INFO: 33epoch:train:13201-13300batch: iter_time=1.117e-04, loss_ctc=72.126, loss_att=56.017, acc=0.722, loss=60.867, grad_norm=5.282, loss_scale=1.000, learning_rate=6.975e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 08:56:08,174 (deepspeed_trainer:228) INFO: 33epoch:train:13301-13400batch: iter_time=1.078e-04, loss_ctc=63.481, loss_att=49.977, acc=0.731, loss=54.022, grad_norm=4.883, loss_scale=1.000, learning_rate=6.975e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 08:56:40,892 (deepspeed_trainer:228) INFO: 33epoch:train:13401-13500batch: iter_time=1.094e-04, loss_ctc=64.425, loss_att=49.454, acc=0.723, loss=53.957, grad_norm=6.212, loss_scale=1.000, learning_rate=6.974e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 08:57:13,817 (deepspeed_trainer:228) INFO: 33epoch:train:13501-13600batch: iter_time=1.112e-04, loss_ctc=72.658, loss_att=53.483, acc=0.712, loss=59.249, grad_norm=5.618, loss_scale=1.000, learning_rate=6.973e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 08:57:46,854 (deepspeed_trainer:228) INFO: 33epoch:train:13601-13700batch: iter_time=1.096e-04, loss_ctc=77.683, loss_att=61.928, acc=0.716, loss=66.652, grad_norm=6.603, loss_scale=1.000, learning_rate=6.973e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 08:58:19,648 (deepspeed_trainer:228) INFO: 33epoch:train:13701-13800batch: iter_time=1.104e-04, loss_ctc=71.070, loss_att=52.663, acc=0.723, loss=58.162, grad_norm=5.396, loss_scale=1.000, learning_rate=6.972e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 08:58:52,415 (deepspeed_trainer:228) INFO: 33epoch:train:13801-13900batch: iter_time=1.126e-04, loss_ctc=64.280, loss_att=45.944, acc=0.741, loss=51.462, grad_norm=4.794, loss_scale=1.000, learning_rate=6.971e-05, step_time=0.327 [2024-12-08 08:59:25,328] [INFO] [logging.py:129:log_dist] [Rank 0] step=494000, skipped=0, lr=[np.float64(6.970143298775889e-05)], mom=[[0.9, 0.98]] [2024-12-08 08:59:25,328] [INFO] [timer.py:264:stop] epoch=0/micro_step=104000/global_step=104000, RunningAvgSamplesPerSec=49.25557492300067, CurrSamplesPerSec=48.861021844116785, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 08:59:25,331 (deepspeed_trainer:228) INFO: 33epoch:train:13901-14000batch: iter_time=1.089e-04, loss_ctc=79.907, loss_att=62.240, acc=0.704, loss=67.528, grad_norm=6.940, loss_scale=1.000, learning_rate=6.970e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 08:59:58,402 (deepspeed_trainer:228) INFO: 33epoch:train:14001-14100batch: iter_time=1.129e-04, loss_ctc=83.614, loss_att=64.822, acc=0.707, loss=70.438, grad_norm=6.353, loss_scale=1.000, learning_rate=6.970e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 09:00:31,212 (deepspeed_trainer:228) INFO: 33epoch:train:14101-14200batch: iter_time=1.113e-04, loss_ctc=69.689, loss_att=49.678, acc=0.731, loss=55.676, grad_norm=5.759, loss_scale=1.000, learning_rate=6.969e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 09:01:04,242 (deepspeed_trainer:228) INFO: 33epoch:train:14201-14300batch: iter_time=1.137e-04, loss_ctc=68.109, loss_att=52.134, acc=0.720, loss=56.939, grad_norm=4.983, loss_scale=1.000, learning_rate=6.968e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 09:01:37,295 (deepspeed_trainer:228) INFO: 33epoch:train:14301-14400batch: iter_time=1.224e-04, loss_ctc=82.146, loss_att=63.523, acc=0.720, loss=69.091, grad_norm=5.686, loss_scale=1.000, learning_rate=6.968e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 09:02:09,955 (deepspeed_trainer:228) INFO: 33epoch:train:14401-14500batch: iter_time=1.112e-04, loss_ctc=63.259, loss_att=46.804, acc=0.729, loss=51.743, grad_norm=4.984, loss_scale=1.000, learning_rate=6.967e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 09:02:42,966 (deepspeed_trainer:228) INFO: 33epoch:train:14501-14600batch: iter_time=1.106e-04, loss_ctc=65.221, loss_att=44.760, acc=0.735, loss=50.923, grad_norm=4.883, loss_scale=1.000, learning_rate=6.966e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 09:03:16,734 (deepspeed_trainer:228) INFO: 33epoch:train:14601-14700batch: iter_time=1.115e-04, loss_ctc=71.808, loss_att=50.624, acc=0.738, loss=56.996, grad_norm=5.155, loss_scale=1.000, learning_rate=6.966e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 09:03:50,374 (deepspeed_trainer:228) INFO: 33epoch:train:14701-14800batch: iter_time=1.085e-04, loss_ctc=71.938, loss_att=56.881, acc=0.722, loss=61.407, grad_norm=6.065, loss_scale=1.000, learning_rate=6.965e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 09:04:23,921 (deepspeed_trainer:228) INFO: 33epoch:train:14801-14900batch: iter_time=1.097e-04, loss_ctc=73.234, loss_att=54.215, acc=0.724, loss=59.935, grad_norm=5.505, loss_scale=1.000, learning_rate=6.964e-05, step_time=0.335 [2024-12-08 09:04:56,582] [INFO] [logging.py:129:log_dist] [Rank 0] step=495000, skipped=0, lr=[np.float64(6.963099204797927e-05)], mom=[[0.9, 0.98]] [2024-12-08 09:04:56,583] [INFO] [timer.py:264:stop] epoch=0/micro_step=105000/global_step=105000, RunningAvgSamplesPerSec=49.26026490768916, CurrSamplesPerSec=53.029558850812684, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 09:04:56,585 (deepspeed_trainer:228) INFO: 33epoch:train:14901-15000batch: iter_time=1.108e-04, loss_ctc=72.672, loss_att=51.232, acc=0.732, loss=57.676, grad_norm=5.184, loss_scale=1.000, learning_rate=6.963e-05, step_time=0.326 [2024-12-08 09:05:12,133] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:11,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:12,280] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:12,303] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:12,326] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:12,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:12,374] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:12,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:12,487] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:12,676] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:12,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:12,580] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:12,754] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:12,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:12,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:12,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:27,891] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:28,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:28,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:28,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:28,384] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:28,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:29,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:29,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:29,429] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:29,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:29,770] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:29,635] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:29,830] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:29,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:29,786] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:29,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:44,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:44,440] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:44,648] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:44,675] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:44,955] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:44,810] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:44,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:45,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:45,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:45,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:45,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:46,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:46,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:46,211] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:46,481] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:46,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:05:59,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:00,179] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:00,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:00,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:00,805] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:01,044] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:00,891] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:01,037] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:01,550] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:01,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:02,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:02,228] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:02,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:02,396] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:02,760] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:02,803] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:06:15,543] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 33 is about to be saved! [2024-12-08 09:06:15,577] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/mp_rank_00_model_states.pt [2024-12-08 09:06:15,577] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/mp_rank_00_model_states.pt... [2024-12-08 09:06:17,522] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/mp_rank_00_model_states.pt. [2024-12-08 09:06:17,679] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-08 09:06:17,679] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-08 09:06:17,511] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-08 09:06:17,681] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-08 09:06:17,512] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-08 09:06:17,513] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-08 09:06:17,514] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-08 09:06:17,515] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-08 09:06:17,515] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-08 09:06:17,684] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-08 09:06:17,516] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-08 09:06:17,685] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-08 09:06:17,517] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-08 09:06:17,686] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-08 09:06:17,686] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-08 09:06:17,687] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-08 09:06:18,120] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,120] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,120] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [2024-12-08 09:06:18,384] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,384] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,384] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [2024-12-08 09:06:18,255] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,255] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,255] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [2024-12-08 09:06:18,298] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,298] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,298] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [2024-12-08 09:06:18,469] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,471] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,471] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,471] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [2024-12-08 09:06:18,316] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,316] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,316] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [2024-12-08 09:06:18,322] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,322] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,322] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [2024-12-08 09:06:18,495] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,496] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,496] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [2024-12-08 09:06:18,502] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,502] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,502] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [2024-12-08 09:06:18,502] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,502] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,502] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [2024-12-08 09:06:18,338] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,338] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,338] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [2024-12-08 09:06:18,520] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,520] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,520] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [2024-12-08 09:06:18,523] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,523] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,523] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [2024-12-08 09:06:18,537] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,537] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [2024-12-08 09:06:18,373] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,374] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,374] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [2024-12-08 09:06:18,374] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-08 09:06:18,375] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_33/33/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-08 09:06:18,375] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 33 is ready now! [cnode7-012:0/16] 2024-12-08 09:06:18,549 (deepspeed_trainer:158) INFO: 33epoch results: [train] iter_time=0.134, loss_ctc=71.882, loss_att=53.778, acc=0.725, loss=59.210, grad_norm=5.697, loss_scale=1.000, learning_rate=7.017e-05, step_time=0.335, time=1 hour, 57 minutes and 30.82 seconds, total_count=495033, gpu_max_cached_mem_GB=41.424, [valid] loss_ctc=3.703, cer_ctc=0.067, loss_att=5.750, acc=0.835, cer=0.401, wer=1.000, loss=5.125, time=1 minute and 10.15 seconds, total_count=33, gpu_max_cached_mem_GB=41.424 [cnode7-012:0/16] 2024-12-08 09:06:22,484 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-08 09:06:49,736 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 09:07:05,634 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 09:07:05,634 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 09:07:05,636 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 09:07:23,212] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:07:24,195] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:07:24,734] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:07:24,783] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:07:24,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:07:25,039] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:07:25,221] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:07:25,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:07:25,260] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:07:25,113] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:07:25,521] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:07:26,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:07:26,496] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:07:26,707] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:07:27,127] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:07:29,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:12,399] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:13,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:13,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:13,822] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:14,337] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:14,556] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:14,714] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:14,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:14,771] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:15,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:15,253] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:16,752] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:17,015] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:17,493] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:17,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:18,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:08:59,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:01,055] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:01,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:02,403] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:02,873] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:03,055] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:03,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:03,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:04,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:06,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:06,399] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:06,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:06,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:07,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:08,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:08,996] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:47,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:48,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:49,388] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:49,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:50,294] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:51,036] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:51,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:51,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:52,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:54,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:54,803] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:55,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:56,080] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:57,123] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:57,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:09:59,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 09:11:00,483 (deepspeed_trainer:228) INFO: 34epoch:train:1-100batch: iter_time=2.403, loss_ctc=75.402, loss_att=57.830, acc=0.713, loss=63.102, grad_norm=5.949, loss_scale=1.000, learning_rate=6.963e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-08 09:11:34,012 (deepspeed_trainer:228) INFO: 34epoch:train:101-200batch: iter_time=1.167e-04, loss_ctc=64.746, loss_att=57.583, acc=0.707, loss=59.788, grad_norm=5.240, loss_scale=1.000, learning_rate=6.962e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 09:12:07,064 (deepspeed_trainer:228) INFO: 34epoch:train:201-300batch: iter_time=1.075e-04, loss_ctc=67.357, loss_att=49.517, acc=0.723, loss=54.844, grad_norm=5.996, loss_scale=1.000, learning_rate=6.961e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 09:12:40,485 (deepspeed_trainer:228) INFO: 34epoch:train:301-400batch: iter_time=1.106e-04, loss_ctc=70.208, loss_att=56.634, acc=0.713, loss=60.711, grad_norm=5.785, loss_scale=1.000, learning_rate=6.961e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 09:13:13,796 (deepspeed_trainer:228) INFO: 34epoch:train:401-500batch: iter_time=1.064e-04, loss_ctc=76.825, loss_att=52.315, acc=0.725, loss=59.673, grad_norm=6.701, loss_scale=1.000, learning_rate=6.960e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 09:13:47,105 (deepspeed_trainer:228) INFO: 34epoch:train:501-600batch: iter_time=1.103e-04, loss_ctc=76.263, loss_att=58.108, acc=0.713, loss=63.562, grad_norm=5.903, loss_scale=1.000, learning_rate=6.959e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 09:14:20,222 (deepspeed_trainer:228) INFO: 34epoch:train:601-700batch: iter_time=1.077e-04, loss_ctc=68.320, loss_att=51.713, acc=0.710, loss=56.690, grad_norm=5.662, loss_scale=1.000, learning_rate=6.959e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 09:14:53,442 (deepspeed_trainer:228) INFO: 34epoch:train:701-800batch: iter_time=1.075e-04, loss_ctc=63.598, loss_att=48.837, acc=0.724, loss=53.263, grad_norm=5.423, loss_scale=1.000, learning_rate=6.958e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 09:15:27,055 (deepspeed_trainer:228) INFO: 34epoch:train:801-900batch: iter_time=1.071e-04, loss_ctc=77.492, loss_att=55.782, acc=0.714, loss=62.267, grad_norm=6.359, loss_scale=1.000, learning_rate=6.957e-05, step_time=0.336 [2024-12-08 09:16:00,328] [INFO] [logging.py:129:log_dist] [Rank 0] step=496000, skipped=0, lr=[np.float64(6.956076424232244e-05)], mom=[[0.9, 0.98]] [2024-12-08 09:16:00,329] [INFO] [timer.py:264:stop] epoch=0/micro_step=106000/global_step=106000, RunningAvgSamplesPerSec=49.2564963814746, CurrSamplesPerSec=51.58556961228104, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 09:16:00,332 (deepspeed_trainer:228) INFO: 34epoch:train:901-1000batch: iter_time=1.124e-04, loss_ctc=72.791, loss_att=56.454, acc=0.727, loss=61.351, grad_norm=5.583, loss_scale=1.000, learning_rate=6.956e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 09:16:33,511 (deepspeed_trainer:228) INFO: 34epoch:train:1001-1100batch: iter_time=1.100e-04, loss_ctc=70.730, loss_att=52.403, acc=0.723, loss=57.842, grad_norm=5.857, loss_scale=1.000, learning_rate=6.956e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 09:17:06,639 (deepspeed_trainer:228) INFO: 34epoch:train:1101-1200batch: iter_time=1.125e-04, loss_ctc=68.085, loss_att=50.204, acc=0.738, loss=55.594, grad_norm=5.586, loss_scale=1.000, learning_rate=6.955e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 09:17:40,346 (deepspeed_trainer:228) INFO: 34epoch:train:1201-1300batch: iter_time=2.128e-04, loss_ctc=68.765, loss_att=51.229, acc=0.727, loss=56.494, grad_norm=5.183, loss_scale=1.000, learning_rate=6.954e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 09:18:13,533 (deepspeed_trainer:228) INFO: 34epoch:train:1301-1400batch: iter_time=1.139e-04, loss_ctc=68.927, loss_att=47.038, acc=0.730, loss=53.604, grad_norm=5.375, loss_scale=1.000, learning_rate=6.954e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 09:18:46,859 (deepspeed_trainer:228) INFO: 34epoch:train:1401-1500batch: iter_time=1.137e-04, loss_ctc=66.955, loss_att=51.384, acc=0.715, loss=56.051, grad_norm=5.452, loss_scale=1.000, learning_rate=6.953e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 09:19:20,196 (deepspeed_trainer:228) INFO: 34epoch:train:1501-1600batch: iter_time=1.151e-04, loss_ctc=75.165, loss_att=57.602, acc=0.713, loss=62.865, grad_norm=6.498, loss_scale=1.000, learning_rate=6.952e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 09:19:53,727 (deepspeed_trainer:228) INFO: 34epoch:train:1601-1700batch: iter_time=1.110e-04, loss_ctc=78.187, loss_att=60.014, acc=0.710, loss=65.463, grad_norm=5.971, loss_scale=1.000, learning_rate=6.952e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 09:20:27,023 (deepspeed_trainer:228) INFO: 34epoch:train:1701-1800batch: iter_time=1.131e-04, loss_ctc=62.569, loss_att=46.401, acc=0.722, loss=51.252, grad_norm=5.424, loss_scale=1.000, learning_rate=6.951e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 09:20:56,373 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-08 09:21:22,895 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 09:21:41,312 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 09:21:41,312 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 09:21:41,315 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 09:22:10,425] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:22:10,638] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:22:10,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:22:10,789] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:22:10,794] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:22:10,885] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:22:11,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:22:11,379] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:22:13,048] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:22:13,132] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:22:13,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:22:13,385] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:22:13,424] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:22:13,511] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:22:13,534] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:22:13,572] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:01,658] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:01,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:01,794] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:02,028] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:02,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:03,412] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:03,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:03,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:04,276] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:05,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:05,572] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:06,419] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:06,482] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:06,536] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:06,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:07,109] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:51,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:51,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:51,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:51,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:51,924] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:53,515] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:53,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:53,830] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:53,932] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:54,519] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:55,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:55,975] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:56,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:56,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:57,258] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:23:58,528] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:39,961] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:40,038] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:40,944] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:40,987] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:41,763] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:42,237] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:43,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:43,507] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:43,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:44,293] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:44,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:45,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:45,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:46,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:46,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:24:48,971] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 09:25:24,816 (deepspeed_trainer:228) INFO: 34epoch:train:1801-1900batch: iter_time=2.578, loss_ctc=75.719, loss_att=56.417, acc=0.726, loss=62.161, grad_norm=5.665, loss_scale=1.000, learning_rate=6.950e-05, step_time=0.399 [2024-12-08 09:25:58,263] [INFO] [logging.py:129:log_dist] [Rank 0] step=497000, skipped=0, lr=[np.float64(6.949074849814717e-05)], mom=[[0.9, 0.98]] [2024-12-08 09:25:58,266] [INFO] [timer.py:264:stop] epoch=0/micro_step=107000/global_step=107000, RunningAvgSamplesPerSec=49.24963969635319, CurrSamplesPerSec=49.50436103252372, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 09:25:58,269 (deepspeed_trainer:228) INFO: 34epoch:train:1901-2000batch: iter_time=1.385e-04, loss_ctc=65.586, loss_att=50.161, acc=0.716, loss=54.779, grad_norm=5.220, loss_scale=1.000, learning_rate=6.949e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 09:26:31,980 (deepspeed_trainer:228) INFO: 34epoch:train:2001-2100batch: iter_time=1.127e-04, loss_ctc=69.912, loss_att=56.664, acc=0.715, loss=60.633, grad_norm=5.491, loss_scale=1.000, learning_rate=6.949e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 09:27:05,197 (deepspeed_trainer:228) INFO: 34epoch:train:2101-2200batch: iter_time=1.066e-04, loss_ctc=64.657, loss_att=47.925, acc=0.720, loss=52.949, grad_norm=5.449, loss_scale=1.000, learning_rate=6.948e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 09:27:38,688 (deepspeed_trainer:228) INFO: 34epoch:train:2201-2300batch: iter_time=1.079e-04, loss_ctc=73.699, loss_att=58.191, acc=0.711, loss=62.845, grad_norm=5.421, loss_scale=1.000, learning_rate=6.947e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 09:28:12,250 (deepspeed_trainer:228) INFO: 34epoch:train:2301-2400batch: iter_time=1.087e-04, loss_ctc=75.189, loss_att=51.934, acc=0.718, loss=58.941, grad_norm=6.272, loss_scale=1.000, learning_rate=6.947e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 09:28:45,658 (deepspeed_trainer:228) INFO: 34epoch:train:2401-2500batch: iter_time=1.087e-04, loss_ctc=74.184, loss_att=54.562, acc=0.714, loss=60.459, grad_norm=5.522, loss_scale=1.000, learning_rate=6.946e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 09:29:19,615 (deepspeed_trainer:228) INFO: 34epoch:train:2501-2600batch: iter_time=1.074e-04, loss_ctc=67.992, loss_att=53.471, acc=0.707, loss=57.819, grad_norm=5.867, loss_scale=1.000, learning_rate=6.945e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 09:29:53,098 (deepspeed_trainer:228) INFO: 34epoch:train:2601-2700batch: iter_time=1.108e-04, loss_ctc=63.343, loss_att=48.172, acc=0.720, loss=52.728, grad_norm=5.321, loss_scale=1.000, learning_rate=6.945e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 09:30:26,718 (deepspeed_trainer:228) INFO: 34epoch:train:2701-2800batch: iter_time=1.086e-04, loss_ctc=72.632, loss_att=50.009, acc=0.727, loss=56.762, grad_norm=6.133, loss_scale=1.000, learning_rate=6.944e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 09:31:00,194 (deepspeed_trainer:228) INFO: 34epoch:train:2801-2900batch: iter_time=1.101e-04, loss_ctc=68.510, loss_att=52.478, acc=0.724, loss=57.307, grad_norm=6.089, loss_scale=1.000, learning_rate=6.943e-05, step_time=0.335 [2024-12-08 09:31:33,478] [INFO] [logging.py:129:log_dist] [Rank 0] step=498000, skipped=0, lr=[np.float64(6.94209437503546e-05)], mom=[[0.9, 0.98]] [2024-12-08 09:31:33,478] [INFO] [timer.py:264:stop] epoch=0/micro_step=108000/global_step=108000, RunningAvgSamplesPerSec=49.24896070811553, CurrSamplesPerSec=49.00123288568553, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 09:31:33,481 (deepspeed_trainer:228) INFO: 34epoch:train:2901-3000batch: iter_time=1.103e-04, loss_ctc=75.127, loss_att=53.528, acc=0.724, loss=59.972, grad_norm=6.347, loss_scale=1.000, learning_rate=6.942e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 09:32:06,864 (deepspeed_trainer:228) INFO: 34epoch:train:3001-3100batch: iter_time=1.107e-04, loss_ctc=64.456, loss_att=50.023, acc=0.733, loss=54.336, grad_norm=5.087, loss_scale=1.000, learning_rate=6.942e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 09:32:39,962 (deepspeed_trainer:228) INFO: 34epoch:train:3101-3200batch: iter_time=1.119e-04, loss_ctc=69.619, loss_att=51.401, acc=0.722, loss=56.866, grad_norm=6.001, loss_scale=1.000, learning_rate=6.941e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 09:33:13,089 (deepspeed_trainer:228) INFO: 34epoch:train:3201-3300batch: iter_time=1.107e-04, loss_ctc=67.925, loss_att=45.511, acc=0.729, loss=52.260, grad_norm=4.959, loss_scale=1.000, learning_rate=6.940e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 09:33:46,367 (deepspeed_trainer:228) INFO: 34epoch:train:3301-3400batch: iter_time=1.102e-04, loss_ctc=72.229, loss_att=58.874, acc=0.704, loss=62.871, grad_norm=6.824, loss_scale=1.000, learning_rate=6.940e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 09:34:19,929 (deepspeed_trainer:228) INFO: 34epoch:train:3401-3500batch: iter_time=1.090e-04, loss_ctc=73.621, loss_att=53.831, acc=0.709, loss=59.747, grad_norm=5.896, loss_scale=1.000, learning_rate=6.939e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 09:34:53,489 (deepspeed_trainer:228) INFO: 34epoch:train:3501-3600batch: iter_time=1.140e-04, loss_ctc=69.246, loss_att=54.582, acc=0.724, loss=58.956, grad_norm=5.283, loss_scale=1.000, learning_rate=6.938e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 09:35:26,801 (deepspeed_trainer:228) INFO: 34epoch:train:3601-3700batch: iter_time=1.105e-04, loss_ctc=62.154, loss_att=44.727, acc=0.726, loss=49.949, grad_norm=5.785, loss_scale=1.000, learning_rate=6.938e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 09:35:46,880 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-08 09:36:13,655 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 09:36:30,527 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 09:36:30,527 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 09:36:30,530 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 09:36:56,722] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:36:57,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:36:57,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:36:58,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:36:58,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:36:59,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:36:59,557] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:36:59,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:36:59,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:36:59,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:00,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:00,388] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:00,540] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:00,384] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:00,736] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:01,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:44,718] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:45,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:46,567] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:46,874] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:48,299] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:48,718] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:48,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:49,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:49,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:49,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:49,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:49,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:49,474] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:50,678] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:51,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:37:53,007] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:32,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:33,526] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:35,215] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:35,576] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:36,133] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:36,434] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:38,028] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:37,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:38,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:38,573] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:38,276] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:38,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:39,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:41,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:43,042] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:38:43,371] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:21,224] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:22,137] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:23,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:23,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:24,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:25,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:26,358] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:26,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:27,123] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:27,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:27,481] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:27,952] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:29,598] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:31,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:32,984] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:39:33,616] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 09:40:17,174 (deepspeed_trainer:228) INFO: 34epoch:train:3701-3800batch: iter_time=2.504, loss_ctc=81.327, loss_att=64.026, acc=0.710, loss=69.253, grad_norm=5.747, loss_scale=1.000, learning_rate=6.937e-05, step_time=0.404 [cnode7-012:0/16] 2024-12-08 09:40:50,312 (deepspeed_trainer:228) INFO: 34epoch:train:3801-3900batch: iter_time=1.042e-04, loss_ctc=60.798, loss_att=48.394, acc=0.720, loss=52.129, grad_norm=4.927, loss_scale=1.000, learning_rate=6.936e-05, step_time=0.331 [2024-12-08 09:41:23,764] [INFO] [logging.py:129:log_dist] [Rank 0] step=499000, skipped=0, lr=[np.float64(6.935134894132025e-05)], mom=[[0.9, 0.98]] [2024-12-08 09:41:23,765] [INFO] [timer.py:264:stop] epoch=0/micro_step=109000/global_step=109000, RunningAvgSamplesPerSec=49.241267097506054, CurrSamplesPerSec=52.06276644434653, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 09:41:23,768 (deepspeed_trainer:228) INFO: 34epoch:train:3901-4000batch: iter_time=1.078e-04, loss_ctc=69.702, loss_att=55.182, acc=0.724, loss=59.547, grad_norm=5.328, loss_scale=1.000, learning_rate=6.935e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 09:41:56,634 (deepspeed_trainer:228) INFO: 34epoch:train:4001-4100batch: iter_time=1.151e-04, loss_ctc=67.805, loss_att=54.879, acc=0.713, loss=58.749, grad_norm=5.708, loss_scale=1.000, learning_rate=6.935e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 09:42:29,609 (deepspeed_trainer:228) INFO: 34epoch:train:4101-4200batch: iter_time=1.078e-04, loss_ctc=71.453, loss_att=51.181, acc=0.721, loss=57.252, grad_norm=5.112, loss_scale=1.000, learning_rate=6.934e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 09:43:02,674 (deepspeed_trainer:228) INFO: 34epoch:train:4201-4300batch: iter_time=1.079e-04, loss_ctc=76.607, loss_att=56.221, acc=0.724, loss=62.332, grad_norm=5.947, loss_scale=1.000, learning_rate=6.933e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 09:43:35,658 (deepspeed_trainer:228) INFO: 34epoch:train:4301-4400batch: iter_time=1.096e-04, loss_ctc=74.041, loss_att=51.983, acc=0.724, loss=58.612, grad_norm=6.214, loss_scale=1.000, learning_rate=6.933e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 09:44:08,408 (deepspeed_trainer:228) INFO: 34epoch:train:4401-4500batch: iter_time=1.068e-04, loss_ctc=61.666, loss_att=49.971, acc=0.716, loss=53.487, grad_norm=5.456, loss_scale=1.000, learning_rate=6.932e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 09:44:41,198 (deepspeed_trainer:228) INFO: 34epoch:train:4501-4600batch: iter_time=1.069e-04, loss_ctc=70.636, loss_att=51.170, acc=0.725, loss=57.003, grad_norm=5.448, loss_scale=1.000, learning_rate=6.931e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 09:45:13,880 (deepspeed_trainer:228) INFO: 34epoch:train:4601-4700batch: iter_time=1.086e-04, loss_ctc=68.290, loss_att=49.861, acc=0.733, loss=55.381, grad_norm=5.650, loss_scale=1.000, learning_rate=6.931e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 09:45:46,982 (deepspeed_trainer:228) INFO: 34epoch:train:4701-4800batch: iter_time=1.063e-04, loss_ctc=71.703, loss_att=56.195, acc=0.721, loss=60.864, grad_norm=5.589, loss_scale=1.000, learning_rate=6.930e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 09:46:20,210 (deepspeed_trainer:228) INFO: 34epoch:train:4801-4900batch: iter_time=1.098e-04, loss_ctc=73.607, loss_att=51.725, acc=0.735, loss=58.280, grad_norm=6.310, loss_scale=1.000, learning_rate=6.929e-05, step_time=0.332 [2024-12-08 09:46:53,655] [INFO] [logging.py:129:log_dist] [Rank 0] step=500000, skipped=0, lr=[np.float64(6.928196302082671e-05)], mom=[[0.9, 0.98]] [2024-12-08 09:46:53,656] [INFO] [timer.py:264:stop] epoch=0/micro_step=110000/global_step=110000, RunningAvgSamplesPerSec=49.24799596973058, CurrSamplesPerSec=50.12795826851834, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 09:46:53,658 (deepspeed_trainer:228) INFO: 34epoch:train:4901-5000batch: iter_time=1.102e-04, loss_ctc=67.882, loss_att=53.570, acc=0.734, loss=57.841, grad_norm=5.182, loss_scale=1.000, learning_rate=6.929e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 09:47:27,810 (deepspeed_trainer:228) INFO: 34epoch:train:5001-5100batch: iter_time=1.086e-04, loss_ctc=67.336, loss_att=46.591, acc=0.731, loss=52.808, grad_norm=5.228, loss_scale=1.000, learning_rate=6.928e-05, step_time=0.341 [cnode7-012:0/16] 2024-12-08 09:48:00,681 (deepspeed_trainer:228) INFO: 34epoch:train:5101-5200batch: iter_time=1.081e-04, loss_ctc=62.527, loss_att=43.642, acc=0.728, loss=49.294, grad_norm=4.805, loss_scale=1.000, learning_rate=6.927e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 09:48:34,496 (deepspeed_trainer:228) INFO: 34epoch:train:5201-5300batch: iter_time=1.076e-04, loss_ctc=76.357, loss_att=62.122, acc=0.714, loss=66.412, grad_norm=6.111, loss_scale=1.000, learning_rate=6.926e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 09:49:07,566 (deepspeed_trainer:228) INFO: 34epoch:train:5301-5400batch: iter_time=1.078e-04, loss_ctc=70.472, loss_att=52.465, acc=0.715, loss=57.861, grad_norm=5.913, loss_scale=1.000, learning_rate=6.926e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 09:49:40,567 (deepspeed_trainer:228) INFO: 34epoch:train:5401-5500batch: iter_time=1.120e-04, loss_ctc=68.438, loss_att=51.977, acc=0.723, loss=56.908, grad_norm=5.829, loss_scale=1.000, learning_rate=6.925e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 09:50:13,871 (deepspeed_trainer:228) INFO: 34epoch:train:5501-5600batch: iter_time=1.080e-04, loss_ctc=67.055, loss_att=47.076, acc=0.731, loss=53.076, grad_norm=5.134, loss_scale=1.000, learning_rate=6.924e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 09:50:25,644 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-08 09:50:52,458 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 09:51:08,617 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 09:51:08,618 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 09:51:08,621 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 09:51:33,348] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:51:33,833] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:51:35,132] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:51:35,474] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:51:36,135] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:51:36,770] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:51:37,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:51:37,675] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:51:37,757] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:51:37,843] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:51:37,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:51:38,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:51:38,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:51:38,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:51:39,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:51:39,429] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:21,706] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:22,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:24,221] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:24,684] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:25,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:26,115] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:26,426] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:26,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:27,689] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:28,326] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:28,401] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:29,060] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:29,263] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:29,318] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:30,702] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:52:31,474] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:10,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:12,950] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:14,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:15,016] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:14,970] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:15,028] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:15,343] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:15,641] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:16,390] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:16,315] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:16,748] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:16,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:17,990] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:19,785] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:21,857] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:22,170] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:53:57,772] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:54:01,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:54:02,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:54:03,392] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:54:03,748] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:54:03,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:54:04,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:54:04,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:54:05,324] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:54:05,374] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:54:05,287] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:54:05,765] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:54:06,333] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:54:08,576] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:54:12,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 09:54:12,907] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 09:55:05,528 (deepspeed_trainer:228) INFO: 34epoch:train:5601-5700batch: iter_time=2.498, loss_ctc=78.810, loss_att=59.807, acc=0.723, loss=65.523, grad_norm=5.929, loss_scale=1.000, learning_rate=6.924e-05, step_time=0.418 [cnode7-012:0/16] 2024-12-08 09:55:38,624 (deepspeed_trainer:228) INFO: 34epoch:train:5701-5800batch: iter_time=1.061e-04, loss_ctc=57.218, loss_att=49.152, acc=0.706, loss=51.568, grad_norm=5.115, loss_scale=1.000, learning_rate=6.923e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 09:56:11,880 (deepspeed_trainer:228) INFO: 34epoch:train:5801-5900batch: iter_time=1.051e-04, loss_ctc=69.095, loss_att=51.743, acc=0.724, loss=56.938, grad_norm=5.316, loss_scale=1.000, learning_rate=6.922e-05, step_time=0.332 [2024-12-08 09:56:45,085] [INFO] [logging.py:129:log_dist] [Rank 0] step=501000, skipped=0, lr=[np.float64(6.921278494599706e-05)], mom=[[0.9, 0.98]] [2024-12-08 09:56:45,086] [INFO] [timer.py:264:stop] epoch=0/micro_step=111000/global_step=111000, RunningAvgSamplesPerSec=49.23895068274884, CurrSamplesPerSec=50.137957762658544, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 09:56:45,089 (deepspeed_trainer:228) INFO: 34epoch:train:5901-6000batch: iter_time=1.081e-04, loss_ctc=69.442, loss_att=53.705, acc=0.716, loss=58.440, grad_norm=5.693, loss_scale=1.000, learning_rate=6.922e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 09:57:18,143 (deepspeed_trainer:228) INFO: 34epoch:train:6001-6100batch: iter_time=1.101e-04, loss_ctc=75.108, loss_att=54.328, acc=0.721, loss=60.565, grad_norm=6.389, loss_scale=1.000, learning_rate=6.921e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 09:57:51,147 (deepspeed_trainer:228) INFO: 34epoch:train:6101-6200batch: iter_time=1.063e-04, loss_ctc=76.057, loss_att=56.554, acc=0.711, loss=62.384, grad_norm=5.530, loss_scale=1.000, learning_rate=6.920e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 09:58:23,689 (deepspeed_trainer:228) INFO: 34epoch:train:6201-6300batch: iter_time=1.079e-04, loss_ctc=66.165, loss_att=46.973, acc=0.723, loss=52.757, grad_norm=6.041, loss_scale=1.000, learning_rate=6.920e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-08 09:58:56,616 (deepspeed_trainer:228) INFO: 34epoch:train:6301-6400batch: iter_time=1.071e-04, loss_ctc=65.674, loss_att=52.598, acc=0.709, loss=56.533, grad_norm=5.323, loss_scale=1.000, learning_rate=6.919e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 09:59:29,427 (deepspeed_trainer:228) INFO: 34epoch:train:6401-6500batch: iter_time=1.064e-04, loss_ctc=71.361, loss_att=53.082, acc=0.715, loss=58.549, grad_norm=5.912, loss_scale=1.000, learning_rate=6.918e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 10:00:02,338 (deepspeed_trainer:228) INFO: 34epoch:train:6501-6600batch: iter_time=1.099e-04, loss_ctc=71.172, loss_att=51.513, acc=0.739, loss=57.396, grad_norm=5.700, loss_scale=1.000, learning_rate=6.917e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 10:00:35,212 (deepspeed_trainer:228) INFO: 34epoch:train:6601-6700batch: iter_time=1.073e-04, loss_ctc=66.230, loss_att=50.980, acc=0.718, loss=55.537, grad_norm=5.375, loss_scale=1.000, learning_rate=6.917e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 10:01:08,165 (deepspeed_trainer:228) INFO: 34epoch:train:6701-6800batch: iter_time=1.070e-04, loss_ctc=71.843, loss_att=51.998, acc=0.735, loss=57.958, grad_norm=6.154, loss_scale=1.000, learning_rate=6.916e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 10:01:41,602 (deepspeed_trainer:228) INFO: 34epoch:train:6801-6900batch: iter_time=1.212e-04, loss_ctc=66.464, loss_att=51.924, acc=0.730, loss=56.282, grad_norm=5.073, loss_scale=1.000, learning_rate=6.915e-05, step_time=0.332 [2024-12-08 10:02:15,277] [INFO] [logging.py:129:log_dist] [Rank 0] step=502000, skipped=0, lr=[np.float64(6.914381368122907e-05)], mom=[[0.9, 0.98]] [2024-12-08 10:02:15,278] [INFO] [timer.py:264:stop] epoch=0/micro_step=112000/global_step=112000, RunningAvgSamplesPerSec=49.245493552232716, CurrSamplesPerSec=47.42417181912626, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 10:02:15,281 (deepspeed_trainer:228) INFO: 34epoch:train:6901-7000batch: iter_time=1.397e-04, loss_ctc=70.366, loss_att=47.148, acc=0.728, loss=54.119, grad_norm=5.999, loss_scale=1.000, learning_rate=6.915e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 10:02:48,103 (deepspeed_trainer:228) INFO: 34epoch:train:7001-7100batch: iter_time=1.085e-04, loss_ctc=63.531, loss_att=46.205, acc=0.720, loss=51.407, grad_norm=5.385, loss_scale=1.000, learning_rate=6.914e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 10:03:21,107 (deepspeed_trainer:228) INFO: 34epoch:train:7101-7200batch: iter_time=1.092e-04, loss_ctc=72.445, loss_att=59.402, acc=0.715, loss=63.328, grad_norm=5.584, loss_scale=1.000, learning_rate=6.913e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 10:03:53,874 (deepspeed_trainer:228) INFO: 34epoch:train:7201-7300batch: iter_time=1.085e-04, loss_ctc=74.197, loss_att=54.973, acc=0.709, loss=60.768, grad_norm=6.295, loss_scale=1.000, learning_rate=6.913e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 10:04:26,362 (deepspeed_trainer:228) INFO: 34epoch:train:7301-7400batch: iter_time=1.103e-04, loss_ctc=65.188, loss_att=49.856, acc=0.715, loss=54.438, grad_norm=6.522, loss_scale=1.000, learning_rate=6.912e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-08 10:04:59,016 (deepspeed_trainer:228) INFO: 34epoch:train:7401-7500batch: iter_time=1.069e-04, loss_ctc=68.664, loss_att=48.475, acc=0.734, loss=54.527, grad_norm=4.955, loss_scale=1.000, learning_rate=6.911e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 10:05:02,828 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-08 10:05:30,806 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 10:05:47,964 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 10:05:47,964 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 10:05:47,966 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 10:06:11,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:06:12,418] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:06:12,426] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:06:12,515] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:06:12,908] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:06:12,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:06:13,427] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:06:13,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:06:13,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:06:13,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:06:14,132] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:06:14,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:06:14,132] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:06:15,399] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:06:15,794] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:06:17,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:01,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:01,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:02,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:02,402] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:02,415] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:02,896] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:02,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:02,675] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:03,394] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:03,354] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:03,891] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:04,152] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:04,416] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:04,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:05,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:06,985] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:50,065] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:50,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:51,640] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:51,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:51,767] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:52,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:52,524] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:52,570] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:52,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:53,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:53,584] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:53,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:53,922] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:54,946] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:55,414] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:07:56,702] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:38,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:38,791] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:39,505] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:39,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:40,129] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:40,424] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:41,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:41,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:42,179] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:42,211] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:42,434] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:42,973] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:42,630] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:44,529] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:44,808] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:08:44,489] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 10:09:47,448 (deepspeed_trainer:228) INFO: 34epoch:train:7501-7600batch: iter_time=2.547, loss_ctc=73.943, loss_att=55.604, acc=0.715, loss=61.079, grad_norm=5.551, loss_scale=1.000, learning_rate=6.911e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 10:10:20,686 (deepspeed_trainer:228) INFO: 34epoch:train:7601-7700batch: iter_time=1.115e-04, loss_ctc=64.007, loss_att=55.997, acc=0.709, loss=58.364, grad_norm=5.301, loss_scale=1.000, learning_rate=6.910e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 10:10:53,860 (deepspeed_trainer:228) INFO: 34epoch:train:7701-7800batch: iter_time=1.066e-04, loss_ctc=65.762, loss_att=45.741, acc=0.728, loss=51.705, grad_norm=5.067, loss_scale=1.000, learning_rate=6.909e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 10:11:27,217 (deepspeed_trainer:228) INFO: 34epoch:train:7801-7900batch: iter_time=1.084e-04, loss_ctc=68.898, loss_att=54.694, acc=0.720, loss=58.941, grad_norm=5.201, loss_scale=1.000, learning_rate=6.909e-05, step_time=0.333 [2024-12-08 10:12:00,412] [INFO] [logging.py:129:log_dist] [Rank 0] step=503000, skipped=0, lr=[np.float64(6.907504819813006e-05)], mom=[[0.9, 0.98]] [2024-12-08 10:12:00,413] [INFO] [timer.py:264:stop] epoch=0/micro_step=113000/global_step=113000, RunningAvgSamplesPerSec=49.25165101185902, CurrSamplesPerSec=51.74471089687406, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 10:12:00,416 (deepspeed_trainer:228) INFO: 34epoch:train:7901-8000batch: iter_time=1.128e-04, loss_ctc=74.835, loss_att=51.124, acc=0.723, loss=58.225, grad_norm=6.317, loss_scale=1.000, learning_rate=6.908e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 10:12:33,412 (deepspeed_trainer:228) INFO: 34epoch:train:8001-8100batch: iter_time=1.103e-04, loss_ctc=74.991, loss_att=55.264, acc=0.714, loss=61.170, grad_norm=6.161, loss_scale=1.000, learning_rate=6.907e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 10:13:06,359 (deepspeed_trainer:228) INFO: 34epoch:train:8101-8200batch: iter_time=1.131e-04, loss_ctc=66.928, loss_att=49.513, acc=0.717, loss=54.758, grad_norm=5.531, loss_scale=1.000, learning_rate=6.906e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 10:13:39,268 (deepspeed_trainer:228) INFO: 34epoch:train:8201-8300batch: iter_time=1.087e-04, loss_ctc=62.678, loss_att=48.623, acc=0.720, loss=52.841, grad_norm=6.093, loss_scale=1.000, learning_rate=6.906e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 10:14:12,393 (deepspeed_trainer:228) INFO: 34epoch:train:8301-8400batch: iter_time=1.113e-04, loss_ctc=74.222, loss_att=53.804, acc=0.716, loss=59.922, grad_norm=6.670, loss_scale=1.000, learning_rate=6.905e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 10:14:45,506 (deepspeed_trainer:228) INFO: 34epoch:train:8401-8500batch: iter_time=1.076e-04, loss_ctc=71.853, loss_att=56.679, acc=0.724, loss=61.244, grad_norm=6.682, loss_scale=1.000, learning_rate=6.904e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 10:15:18,407 (deepspeed_trainer:228) INFO: 34epoch:train:8501-8600batch: iter_time=1.122e-04, loss_ctc=68.847, loss_att=51.193, acc=0.722, loss=56.493, grad_norm=6.223, loss_scale=1.000, learning_rate=6.904e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 10:15:51,283 (deepspeed_trainer:228) INFO: 34epoch:train:8601-8700batch: iter_time=1.087e-04, loss_ctc=67.029, loss_att=49.163, acc=0.742, loss=54.520, grad_norm=5.405, loss_scale=1.000, learning_rate=6.903e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 10:16:24,176 (deepspeed_trainer:228) INFO: 34epoch:train:8701-8800batch: iter_time=1.080e-04, loss_ctc=67.547, loss_att=50.355, acc=0.726, loss=55.535, grad_norm=6.090, loss_scale=1.000, learning_rate=6.902e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 10:16:56,936 (deepspeed_trainer:228) INFO: 34epoch:train:8801-8900batch: iter_time=1.067e-04, loss_ctc=67.909, loss_att=45.832, acc=0.732, loss=52.441, grad_norm=6.884, loss_scale=1.000, learning_rate=6.902e-05, step_time=0.327 [2024-12-08 10:17:29,744] [INFO] [logging.py:129:log_dist] [Rank 0] step=504000, skipped=0, lr=[np.float64(6.900648747545244e-05)], mom=[[0.9, 0.98]] [2024-12-08 10:17:29,744] [INFO] [timer.py:264:stop] epoch=0/micro_step=114000/global_step=114000, RunningAvgSamplesPerSec=49.25865705460753, CurrSamplesPerSec=52.72363243480913, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 10:17:29,747 (deepspeed_trainer:228) INFO: 34epoch:train:8901-9000batch: iter_time=1.105e-04, loss_ctc=66.115, loss_att=49.978, acc=0.721, loss=54.821, grad_norm=6.016, loss_scale=1.000, learning_rate=6.901e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 10:18:02,631 (deepspeed_trainer:228) INFO: 34epoch:train:9001-9100batch: iter_time=1.095e-04, loss_ctc=72.211, loss_att=55.132, acc=0.715, loss=60.278, grad_norm=7.008, loss_scale=1.000, learning_rate=6.900e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 10:18:35,542 (deepspeed_trainer:228) INFO: 34epoch:train:9101-9200batch: iter_time=1.113e-04, loss_ctc=75.506, loss_att=58.462, acc=0.713, loss=63.571, grad_norm=6.872, loss_scale=1.000, learning_rate=6.900e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 10:19:08,211 (deepspeed_trainer:228) INFO: 34epoch:train:9201-9300batch: iter_time=1.083e-04, loss_ctc=61.491, loss_att=45.264, acc=0.722, loss=50.132, grad_norm=5.900, loss_scale=1.000, learning_rate=6.899e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 10:19:36,768 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-08 10:20:02,993 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 10:20:20,128 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 10:20:20,129 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 10:20:20,131 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 10:20:44,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:20:45,112] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:20:46,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:20:45,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:20:46,663] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:20:47,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:20:46,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:20:47,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:20:46,833] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:20:47,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:20:47,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:20:47,123] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:20:48,153] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:20:47,385] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:20:47,779] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:20:49,557] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:34,002] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:35,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:34,415] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:34,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:34,886] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:36,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:35,656] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:36,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:36,505] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:38,131] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:38,716] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:38,865] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:39,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:38,399] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:39,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:21:41,138] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:21,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:22,856] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:23,360] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:24,058] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:24,480] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:24,757] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:25,762] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:26,893] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:27,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:26,308] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:26,327] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:28,777] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:29,060] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:28,507] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:30,468] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:22:33,223] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:10,980] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:11,007] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:12,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:13,135] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:13,860] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:13,898] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:15,616] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:14,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:15,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:16,460] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:15,959] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:17,262] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:17,882] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:17,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:20,885] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:23:23,540] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 10:23:59,063 (deepspeed_trainer:228) INFO: 34epoch:train:9301-9400batch: iter_time=2.522, loss_ctc=74.544, loss_att=55.596, acc=0.727, loss=61.276, grad_norm=5.962, loss_scale=1.000, learning_rate=6.898e-05, step_time=0.386 [cnode7-012:0/16] 2024-12-08 10:24:32,696 (deepspeed_trainer:228) INFO: 34epoch:train:9401-9500batch: iter_time=1.085e-04, loss_ctc=64.798, loss_att=50.654, acc=0.719, loss=54.891, grad_norm=5.986, loss_scale=1.000, learning_rate=6.898e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 10:25:06,554 (deepspeed_trainer:228) INFO: 34epoch:train:9501-9600batch: iter_time=1.196e-04, loss_ctc=69.171, loss_att=56.170, acc=0.724, loss=60.060, grad_norm=5.492, loss_scale=1.000, learning_rate=6.897e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 10:25:40,273 (deepspeed_trainer:228) INFO: 34epoch:train:9601-9700batch: iter_time=1.102e-04, loss_ctc=64.278, loss_att=49.399, acc=0.726, loss=53.869, grad_norm=5.635, loss_scale=1.000, learning_rate=6.896e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 10:26:13,735 (deepspeed_trainer:228) INFO: 34epoch:train:9701-9800batch: iter_time=1.141e-04, loss_ctc=72.790, loss_att=58.193, acc=0.718, loss=62.558, grad_norm=5.891, loss_scale=1.000, learning_rate=6.896e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 10:26:47,561 (deepspeed_trainer:228) INFO: 34epoch:train:9801-9900batch: iter_time=1.113e-04, loss_ctc=73.186, loss_att=50.183, acc=0.727, loss=57.093, grad_norm=5.526, loss_scale=1.000, learning_rate=6.895e-05, step_time=0.338 [2024-12-08 10:27:20,997] [INFO] [logging.py:129:log_dist] [Rank 0] step=505000, skipped=0, lr=[np.float64(6.893813049903017e-05)], mom=[[0.9, 0.98]] [2024-12-08 10:27:20,997] [INFO] [timer.py:264:stop] epoch=0/micro_step=115000/global_step=115000, RunningAvgSamplesPerSec=49.25277439569968, CurrSamplesPerSec=52.02696508912616, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 10:27:21,000 (deepspeed_trainer:228) INFO: 34epoch:train:9901-10000batch: iter_time=1.148e-04, loss_ctc=74.028, loss_att=55.570, acc=0.723, loss=61.084, grad_norm=5.524, loss_scale=1.000, learning_rate=6.894e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 10:27:55,028 (deepspeed_trainer:228) INFO: 34epoch:train:10001-10100batch: iter_time=1.104e-04, loss_ctc=67.107, loss_att=54.377, acc=0.711, loss=58.187, grad_norm=5.582, loss_scale=1.000, learning_rate=6.893e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 10:28:28,342 (deepspeed_trainer:228) INFO: 34epoch:train:10101-10200batch: iter_time=1.130e-04, loss_ctc=62.894, loss_att=47.555, acc=0.729, loss=52.144, grad_norm=5.671, loss_scale=1.000, learning_rate=6.893e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 10:29:01,839 (deepspeed_trainer:228) INFO: 34epoch:train:10201-10300batch: iter_time=1.131e-04, loss_ctc=72.085, loss_att=50.382, acc=0.731, loss=56.894, grad_norm=5.539, loss_scale=1.000, learning_rate=6.892e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 10:29:35,121 (deepspeed_trainer:228) INFO: 34epoch:train:10301-10400batch: iter_time=1.136e-04, loss_ctc=68.565, loss_att=52.078, acc=0.730, loss=57.036, grad_norm=5.792, loss_scale=1.000, learning_rate=6.891e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 10:30:08,410 (deepspeed_trainer:228) INFO: 34epoch:train:10401-10500batch: iter_time=1.072e-04, loss_ctc=73.427, loss_att=53.414, acc=0.734, loss=59.443, grad_norm=6.469, loss_scale=1.000, learning_rate=6.891e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 10:30:41,113 (deepspeed_trainer:228) INFO: 34epoch:train:10501-10600batch: iter_time=1.110e-04, loss_ctc=63.470, loss_att=49.543, acc=0.741, loss=53.711, grad_norm=5.441, loss_scale=1.000, learning_rate=6.890e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 10:31:15,105 (deepspeed_trainer:228) INFO: 34epoch:train:10601-10700batch: iter_time=1.108e-04, loss_ctc=68.849, loss_att=50.451, acc=0.730, loss=55.947, grad_norm=5.520, loss_scale=1.000, learning_rate=6.889e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 10:31:48,644 (deepspeed_trainer:228) INFO: 34epoch:train:10701-10800batch: iter_time=1.124e-04, loss_ctc=67.435, loss_att=45.649, acc=0.734, loss=52.184, grad_norm=5.175, loss_scale=1.000, learning_rate=6.889e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 10:32:22,342 (deepspeed_trainer:228) INFO: 34epoch:train:10801-10900batch: iter_time=1.089e-04, loss_ctc=72.098, loss_att=58.539, acc=0.710, loss=62.575, grad_norm=6.397, loss_scale=1.000, learning_rate=6.888e-05, step_time=0.337 [2024-12-08 10:32:55,808] [INFO] [logging.py:129:log_dist] [Rank 0] step=506000, skipped=0, lr=[np.float64(6.886997626171559e-05)], mom=[[0.9, 0.98]] [2024-12-08 10:32:55,809] [INFO] [timer.py:264:stop] epoch=0/micro_step=116000/global_step=116000, RunningAvgSamplesPerSec=49.25274292389762, CurrSamplesPerSec=52.12034551215144, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 10:32:55,811 (deepspeed_trainer:228) INFO: 34epoch:train:10901-11000batch: iter_time=1.122e-04, loss_ctc=72.506, loss_att=53.767, acc=0.718, loss=59.383, grad_norm=6.531, loss_scale=1.000, learning_rate=6.887e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 10:33:28,880 (deepspeed_trainer:228) INFO: 34epoch:train:11001-11100batch: iter_time=1.104e-04, loss_ctc=68.619, loss_att=53.865, acc=0.728, loss=58.307, grad_norm=5.892, loss_scale=1.000, learning_rate=6.887e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 10:34:01,528 (deepspeed_trainer:228) INFO: 34epoch:train:11101-11200batch: iter_time=1.145e-04, loss_ctc=60.760, loss_att=44.110, acc=0.732, loss=49.114, grad_norm=5.888, loss_scale=1.000, learning_rate=6.886e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 10:34:21,610 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-08 10:34:48,228 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 10:35:05,891 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 10:35:05,891 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 10:35:05,893 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 10:35:31,298] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:35:31,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:35:31,806] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:35:32,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:35:32,759] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:35:33,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:35:33,330] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:35:33,539] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:35:33,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:35:34,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:35:34,179] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:35:34,100] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:35:34,763] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:35:34,824] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:35:35,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:35:36,021] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:21,006] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:20,979] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:21,796] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:21,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:22,163] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:22,869] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:23,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:23,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:24,082] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:24,576] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:25,891] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:28,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:29,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:29,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:29,712] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:36:29,749] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:12,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:14,361] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:14,707] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:15,272] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:15,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:15,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:15,812] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:16,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:17,213] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:16,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:18,808] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:19,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:20,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:20,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:21,020] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:37:23,480] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:01,434] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:03,480] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:03,771] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:05,397] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:04,747] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:05,693] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:05,497] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:05,580] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:05,587] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:05,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:07,464] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:07,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:09,027] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:09,462] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:09,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:38:15,714] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 10:38:58,286 (deepspeed_trainer:228) INFO: 34epoch:train:11201-11300batch: iter_time=2.591, loss_ctc=80.059, loss_att=62.945, acc=0.717, loss=68.085, grad_norm=6.483, loss_scale=1.000, learning_rate=6.885e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-08 10:39:32,318 (deepspeed_trainer:228) INFO: 34epoch:train:11301-11400batch: iter_time=1.074e-04, loss_ctc=60.148, loss_att=47.888, acc=0.722, loss=51.593, grad_norm=5.463, loss_scale=1.000, learning_rate=6.885e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 10:40:06,056 (deepspeed_trainer:228) INFO: 34epoch:train:11401-11500batch: iter_time=1.068e-04, loss_ctc=69.240, loss_att=54.474, acc=0.726, loss=58.901, grad_norm=5.559, loss_scale=1.000, learning_rate=6.884e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 10:40:39,767 (deepspeed_trainer:228) INFO: 34epoch:train:11501-11600batch: iter_time=1.090e-04, loss_ctc=66.539, loss_att=53.768, acc=0.719, loss=57.594, grad_norm=6.098, loss_scale=1.000, learning_rate=6.883e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 10:41:13,333 (deepspeed_trainer:228) INFO: 34epoch:train:11601-11700batch: iter_time=1.093e-04, loss_ctc=71.221, loss_att=50.907, acc=0.725, loss=56.994, grad_norm=5.085, loss_scale=1.000, learning_rate=6.883e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 10:41:47,002 (deepspeed_trainer:228) INFO: 34epoch:train:11701-11800batch: iter_time=1.082e-04, loss_ctc=75.448, loss_att=55.495, acc=0.727, loss=61.487, grad_norm=5.687, loss_scale=1.000, learning_rate=6.882e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 10:42:20,508 (deepspeed_trainer:228) INFO: 34epoch:train:11801-11900batch: iter_time=1.075e-04, loss_ctc=73.456, loss_att=51.626, acc=0.728, loss=58.168, grad_norm=5.530, loss_scale=1.000, learning_rate=6.881e-05, step_time=0.335 [2024-12-08 10:42:53,838] [INFO] [logging.py:129:log_dist] [Rank 0] step=507000, skipped=0, lr=[np.float64(6.880202376331716e-05)], mom=[[0.9, 0.98]] [2024-12-08 10:42:53,838] [INFO] [timer.py:264:stop] epoch=0/micro_step=117000/global_step=117000, RunningAvgSamplesPerSec=49.24815861125274, CurrSamplesPerSec=49.68295525452164, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 10:42:53,841 (deepspeed_trainer:228) INFO: 34epoch:train:11901-12000batch: iter_time=1.071e-04, loss_ctc=60.714, loss_att=49.361, acc=0.721, loss=52.762, grad_norm=5.006, loss_scale=1.000, learning_rate=6.881e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 10:43:27,444 (deepspeed_trainer:228) INFO: 34epoch:train:12001-12100batch: iter_time=1.114e-04, loss_ctc=69.846, loss_att=51.103, acc=0.726, loss=56.733, grad_norm=5.872, loss_scale=1.000, learning_rate=6.880e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 10:44:00,770 (deepspeed_trainer:228) INFO: 34epoch:train:12101-12200batch: iter_time=1.062e-04, loss_ctc=66.590, loss_att=49.060, acc=0.738, loss=54.328, grad_norm=5.566, loss_scale=1.000, learning_rate=6.879e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 10:44:34,162 (deepspeed_trainer:228) INFO: 34epoch:train:12201-12300batch: iter_time=1.075e-04, loss_ctc=71.176, loss_att=55.312, acc=0.721, loss=60.054, grad_norm=5.680, loss_scale=1.000, learning_rate=6.879e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 10:45:07,771 (deepspeed_trainer:228) INFO: 34epoch:train:12301-12400batch: iter_time=1.098e-04, loss_ctc=72.291, loss_att=50.943, acc=0.738, loss=57.353, grad_norm=5.935, loss_scale=1.000, learning_rate=6.878e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 10:45:41,237 (deepspeed_trainer:228) INFO: 34epoch:train:12401-12500batch: iter_time=1.095e-04, loss_ctc=68.503, loss_att=53.312, acc=0.734, loss=57.873, grad_norm=5.260, loss_scale=1.000, learning_rate=6.877e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 10:46:14,243 (deepspeed_trainer:228) INFO: 34epoch:train:12501-12600batch: iter_time=1.102e-04, loss_ctc=66.541, loss_att=46.268, acc=0.731, loss=52.348, grad_norm=5.454, loss_scale=1.000, learning_rate=6.876e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 10:46:47,315 (deepspeed_trainer:228) INFO: 34epoch:train:12601-12700batch: iter_time=1.112e-04, loss_ctc=61.654, loss_att=43.268, acc=0.732, loss=48.780, grad_norm=5.580, loss_scale=1.000, learning_rate=6.876e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 10:47:21,258 (deepspeed_trainer:228) INFO: 34epoch:train:12701-12800batch: iter_time=1.075e-04, loss_ctc=74.938, loss_att=61.541, acc=0.718, loss=65.538, grad_norm=7.517, loss_scale=1.000, learning_rate=6.875e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 10:47:54,421 (deepspeed_trainer:228) INFO: 34epoch:train:12801-12900batch: iter_time=1.068e-04, loss_ctc=69.384, loss_att=52.218, acc=0.717, loss=57.401, grad_norm=5.568, loss_scale=1.000, learning_rate=6.874e-05, step_time=0.336 [2024-12-08 10:48:27,680] [INFO] [logging.py:129:log_dist] [Rank 0] step=508000, skipped=0, lr=[np.float64(6.873427201053784e-05)], mom=[[0.9, 0.98]] [2024-12-08 10:48:27,680] [INFO] [timer.py:264:stop] epoch=0/micro_step=118000/global_step=118000, RunningAvgSamplesPerSec=49.249242920770286, CurrSamplesPerSec=49.90600488810827, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 10:48:27,683 (deepspeed_trainer:228) INFO: 34epoch:train:12901-13000batch: iter_time=1.088e-04, loss_ctc=67.310, loss_att=51.380, acc=0.726, loss=56.156, grad_norm=6.186, loss_scale=1.000, learning_rate=6.874e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 10:49:01,008 (deepspeed_trainer:228) INFO: 34epoch:train:13001-13100batch: iter_time=1.085e-04, loss_ctc=66.039, loss_att=46.369, acc=0.735, loss=52.261, grad_norm=5.621, loss_scale=1.000, learning_rate=6.873e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 10:49:13,353 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-08 10:49:41,149 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 10:49:56,488 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 10:49:56,488 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 10:49:56,490 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 10:50:23,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:50:22,714] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:50:24,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:50:23,582] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:50:23,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:50:24,878] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:50:25,008] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:50:24,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:50:24,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:50:24,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:50:25,418] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:50:25,702] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:50:25,360] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:50:25,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:50:26,095] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:50:26,216] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:11,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:12,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:12,057] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:12,919] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:12,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:13,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:13,541] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:13,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:13,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:13,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:14,318] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:14,668] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:14,068] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:14,103] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:14,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:51:15,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:01,087] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:00,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:01,262] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:01,665] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:02,550] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:02,995] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:03,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:03,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:03,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:03,128] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:04,311] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:04,031] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:04,398] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:04,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:06,250] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:07,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:49,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:50,158] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:50,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:50,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:51,152] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:51,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:51,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:52,369] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:52,434] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:51,898] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:52,721] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:53,524] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:53,876] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:53,917] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:56,740] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 10:52:57,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 10:53:50,108 (deepspeed_trainer:228) INFO: 34epoch:train:13101-13200batch: iter_time=2.503, loss_ctc=78.448, loss_att=60.402, acc=0.721, loss=65.801, grad_norm=6.170, loss_scale=1.000, learning_rate=6.872e-05, step_time=0.388 [cnode7-012:0/16] 2024-12-08 10:54:23,235 (deepspeed_trainer:228) INFO: 34epoch:train:13201-13300batch: iter_time=1.087e-04, loss_ctc=56.370, loss_att=48.990, acc=0.707, loss=51.219, grad_norm=5.074, loss_scale=1.000, learning_rate=6.872e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 10:54:56,258 (deepspeed_trainer:228) INFO: 34epoch:train:13301-13400batch: iter_time=1.064e-04, loss_ctc=68.506, loss_att=51.568, acc=0.726, loss=56.657, grad_norm=5.112, loss_scale=1.000, learning_rate=6.871e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 10:55:29,379 (deepspeed_trainer:228) INFO: 34epoch:train:13401-13500batch: iter_time=1.081e-04, loss_ctc=68.996, loss_att=53.611, acc=0.718, loss=58.262, grad_norm=5.915, loss_scale=1.000, learning_rate=6.870e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 10:56:03,118 (deepspeed_trainer:228) INFO: 34epoch:train:13501-13600batch: iter_time=1.051e-04, loss_ctc=74.208, loss_att=53.748, acc=0.724, loss=59.869, grad_norm=5.005, loss_scale=1.000, learning_rate=6.870e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 10:56:36,005 (deepspeed_trainer:228) INFO: 34epoch:train:13601-13700batch: iter_time=1.095e-04, loss_ctc=75.296, loss_att=55.780, acc=0.714, loss=61.654, grad_norm=5.779, loss_scale=1.000, learning_rate=6.869e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 10:57:08,872 (deepspeed_trainer:228) INFO: 34epoch:train:13701-13800batch: iter_time=1.076e-04, loss_ctc=66.045, loss_att=47.288, acc=0.724, loss=52.899, grad_norm=5.465, loss_scale=1.000, learning_rate=6.868e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 10:57:42,108 (deepspeed_trainer:228) INFO: 34epoch:train:13801-13900batch: iter_time=1.089e-04, loss_ctc=64.801, loss_att=52.136, acc=0.711, loss=55.957, grad_norm=5.685, loss_scale=1.000, learning_rate=6.868e-05, step_time=0.332 [2024-12-08 10:58:15,476] [INFO] [logging.py:129:log_dist] [Rank 0] step=509000, skipped=0, lr=[np.float64(6.866672001691402e-05)], mom=[[0.9, 0.98]] [2024-12-08 10:58:15,477] [INFO] [timer.py:264:stop] epoch=0/micro_step=119000/global_step=119000, RunningAvgSamplesPerSec=49.245665880510536, CurrSamplesPerSec=44.75881138897005, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 10:58:15,480 (deepspeed_trainer:228) INFO: 34epoch:train:13901-14000batch: iter_time=1.113e-04, loss_ctc=70.779, loss_att=53.582, acc=0.715, loss=58.732, grad_norm=5.640, loss_scale=1.000, learning_rate=6.867e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 10:58:48,917 (deepspeed_trainer:228) INFO: 34epoch:train:14001-14100batch: iter_time=1.110e-04, loss_ctc=70.081, loss_att=51.366, acc=0.739, loss=57.012, grad_norm=5.544, loss_scale=1.000, learning_rate=6.866e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 10:59:22,067 (deepspeed_trainer:228) INFO: 34epoch:train:14101-14200batch: iter_time=1.093e-04, loss_ctc=65.784, loss_att=51.439, acc=0.718, loss=55.755, grad_norm=5.838, loss_scale=1.000, learning_rate=6.866e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 10:59:55,507 (deepspeed_trainer:228) INFO: 34epoch:train:14201-14300batch: iter_time=1.116e-04, loss_ctc=71.770, loss_att=51.216, acc=0.739, loss=57.385, grad_norm=6.090, loss_scale=1.000, learning_rate=6.865e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 11:00:28,652 (deepspeed_trainer:228) INFO: 34epoch:train:14301-14400batch: iter_time=1.088e-04, loss_ctc=66.254, loss_att=51.627, acc=0.731, loss=56.007, grad_norm=5.012, loss_scale=1.000, learning_rate=6.864e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 11:01:02,013 (deepspeed_trainer:228) INFO: 34epoch:train:14401-14500batch: iter_time=1.112e-04, loss_ctc=68.760, loss_att=45.924, acc=0.732, loss=52.765, grad_norm=5.910, loss_scale=1.000, learning_rate=6.864e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 11:01:35,088 (deepspeed_trainer:228) INFO: 34epoch:train:14501-14600batch: iter_time=1.093e-04, loss_ctc=63.121, loss_att=45.294, acc=0.724, loss=50.634, grad_norm=5.113, loss_scale=1.000, learning_rate=6.863e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 11:02:08,349 (deepspeed_trainer:228) INFO: 34epoch:train:14601-14700batch: iter_time=1.097e-04, loss_ctc=71.875, loss_att=58.842, acc=0.718, loss=62.777, grad_norm=5.929, loss_scale=1.000, learning_rate=6.862e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 11:02:41,582 (deepspeed_trainer:228) INFO: 34epoch:train:14701-14800batch: iter_time=1.102e-04, loss_ctc=72.898, loss_att=54.412, acc=0.711, loss=59.956, grad_norm=6.311, loss_scale=1.000, learning_rate=6.862e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 11:03:14,807 (deepspeed_trainer:228) INFO: 34epoch:train:14801-14900batch: iter_time=1.121e-04, loss_ctc=65.312, loss_att=50.100, acc=0.716, loss=54.666, grad_norm=5.482, loss_scale=1.000, learning_rate=6.861e-05, step_time=0.332 [2024-12-08 11:03:47,810] [INFO] [logging.py:129:log_dist] [Rank 0] step=510000, skipped=0, lr=[np.float64(6.859936680275532e-05)], mom=[[0.9, 0.98]] [2024-12-08 11:03:47,811] [INFO] [timer.py:264:stop] epoch=0/micro_step=120000/global_step=120000, RunningAvgSamplesPerSec=49.24867378061678, CurrSamplesPerSec=52.97300683358952, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 11:03:47,814 (deepspeed_trainer:228) INFO: 34epoch:train:14901-15000batch: iter_time=1.074e-04, loss_ctc=68.011, loss_att=48.107, acc=0.736, loss=54.086, grad_norm=5.665, loss_scale=1.000, learning_rate=6.860e-05, step_time=0.330 [2024-12-08 11:04:03,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:03,730] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:02,979] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:03,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:03,885] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:03,157] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:03,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:03,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:03,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:04,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:03,392] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:03,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:04,205] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:04,391] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:04,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:04,397] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:18,760] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:18,616] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:19,418] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:18,729] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:19,095] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:20,086] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:19,412] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:20,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:20,453] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:19,861] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:20,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:21,061] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:21,111] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:21,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:20,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:20,558] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:34,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:34,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:34,586] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:35,486] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:36,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:36,119] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:35,372] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:35,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:36,472] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:35,909] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:36,754] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:36,502] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:37,301] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:37,350] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:36,596] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:36,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:49,773] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:49,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:50,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:51,620] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:51,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:51,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:52,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:51,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:52,497] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:51,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:53,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:53,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:53,334] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:52,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:52,888] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:04:52,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:05:06,204] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 34 is about to be saved! [2024-12-08 11:05:06,237] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/mp_rank_00_model_states.pt [2024-12-08 11:05:06,237] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/mp_rank_00_model_states.pt... [2024-12-08 11:05:08,127] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/mp_rank_00_model_states.pt. [2024-12-08 11:05:08,284] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-08 11:05:08,283] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-08 11:05:07,479] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-08 11:05:07,479] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-08 11:05:07,479] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-08 11:05:08,288] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-08 11:05:08,288] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-08 11:05:07,484] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-08 11:05:07,484] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-08 11:05:07,485] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-08 11:05:08,290] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-08 11:05:07,485] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-08 11:05:08,291] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-08 11:05:08,291] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-08 11:05:07,486] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-08 11:05:08,291] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-08 11:05:09,021] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-08 11:05:09,021] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-08 11:05:09,021] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [2024-12-08 11:05:09,036] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-08 11:05:09,036] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-08 11:05:09,036] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [2024-12-08 11:05:09,051] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-08 11:05:09,051] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-08 11:05:09,052] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [2024-12-08 11:05:09,061] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-08 11:05:09,066] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-08 11:05:09,066] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [2024-12-08 11:05:08,262] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-08 11:05:08,262] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-08 11:05:08,262] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [2024-12-08 11:05:08,262] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-08 11:05:08,262] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-08 11:05:08,262] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-08 11:05:08,262] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-08 11:05:08,262] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-08 11:05:08,262] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [2024-12-08 11:05:08,262] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-08 11:05:08,262] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [2024-12-08 11:05:08,262] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-08 11:05:08,262] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [2024-12-08 11:05:08,262] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-08 11:05:08,262] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [2024-12-08 11:05:09,076] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-08 11:05:09,076] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-08 11:05:09,076] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [2024-12-08 11:05:09,081] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-08 11:05:09,081] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-08 11:05:09,081] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [2024-12-08 11:05:09,082] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-08 11:05:09,083] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-08 11:05:09,083] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [2024-12-08 11:05:08,280] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-08 11:05:08,280] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-08 11:05:08,280] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [2024-12-08 11:05:09,099] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-08 11:05:09,099] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-08 11:05:09,099] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [2024-12-08 11:05:08,316] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-08 11:05:08,316] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-08 11:05:08,316] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [2024-12-08 11:05:08,319] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-08 11:05:08,320] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_34/34/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-08 11:05:08,320] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 34 is ready now! [cnode7-012:0/16] 2024-12-08 11:05:09,130 (deepspeed_trainer:158) INFO: 34epoch results: [train] iter_time=0.134, loss_ctc=69.458, loss_att=52.245, acc=0.722, loss=57.408, grad_norm=5.742, loss_scale=1.000, learning_rate=6.911e-05, step_time=0.335, time=1 hour, 57 minutes and 34.59 seconds, total_count=510034, gpu_max_cached_mem_GB=41.424, [valid] loss_ctc=3.734, cer_ctc=0.065, loss_att=6.812, acc=0.788, cer=0.440, wer=1.000, loss=5.906, time=1 minute and 9.52 seconds, total_count=34, gpu_max_cached_mem_GB=41.424 [cnode7-012:0/16] 2024-12-08 11:05:11,466 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-08 11:05:38,237 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 11:05:55,246 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 11:05:55,246 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 11:05:55,249 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 11:06:15,060] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:06:14,321] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:06:16,049] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:06:15,291] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:06:16,486] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:06:16,981] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:06:17,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:06:16,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:06:16,608] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:06:17,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:06:17,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:06:17,020] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:06:17,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:06:17,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:06:17,897] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:06:18,962] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:02,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:03,235] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:03,135] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:05,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:04,580] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:04,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:06,075] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:05,266] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:06,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:06,399] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:07,392] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:06,756] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:07,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:08,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:10,250] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:09,484] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:49,983] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:49,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:50,719] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:53,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:52,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:52,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:53,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:52,891] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:55,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:55,159] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:54,480] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:55,605] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:54,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:56,488] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:07:58,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:00,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:38,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:37,385] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:39,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:40,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:40,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:41,279] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:41,829] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:42,947] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:43,581] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:42,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:44,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:43,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:44,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:45,075] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:46,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:08:51,354] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 11:09:51,236 (deepspeed_trainer:228) INFO: 35epoch:train:1-100batch: iter_time=2.400, loss_ctc=69.031, loss_att=50.867, acc=0.718, loss=56.311, grad_norm=5.904, loss_scale=1.000, learning_rate=6.860e-05, step_time=0.398 [cnode7-012:0/16] 2024-12-08 11:10:24,519 (deepspeed_trainer:228) INFO: 35epoch:train:101-200batch: iter_time=1.095e-04, loss_ctc=80.295, loss_att=56.382, acc=0.729, loss=63.572, grad_norm=5.874, loss_scale=1.000, learning_rate=6.859e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 11:10:57,687 (deepspeed_trainer:228) INFO: 35epoch:train:201-300batch: iter_time=1.091e-04, loss_ctc=73.288, loss_att=52.882, acc=0.726, loss=59.008, grad_norm=5.304, loss_scale=1.000, learning_rate=6.858e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 11:11:30,912 (deepspeed_trainer:228) INFO: 35epoch:train:301-400batch: iter_time=1.111e-04, loss_ctc=68.751, loss_att=49.238, acc=0.721, loss=55.075, grad_norm=5.731, loss_scale=1.000, learning_rate=6.858e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 11:12:04,196 (deepspeed_trainer:228) INFO: 35epoch:train:401-500batch: iter_time=1.190e-04, loss_ctc=83.689, loss_att=60.135, acc=0.702, loss=67.177, grad_norm=5.969, loss_scale=1.000, learning_rate=6.857e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 11:12:37,280 (deepspeed_trainer:228) INFO: 35epoch:train:501-600batch: iter_time=1.084e-04, loss_ctc=64.462, loss_att=46.443, acc=0.723, loss=51.822, grad_norm=5.081, loss_scale=1.000, learning_rate=6.856e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 11:13:10,637 (deepspeed_trainer:228) INFO: 35epoch:train:601-700batch: iter_time=1.076e-04, loss_ctc=72.777, loss_att=55.337, acc=0.719, loss=60.563, grad_norm=5.684, loss_scale=1.000, learning_rate=6.856e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 11:13:43,976 (deepspeed_trainer:228) INFO: 35epoch:train:701-800batch: iter_time=1.129e-04, loss_ctc=73.606, loss_att=54.776, acc=0.706, loss=60.434, grad_norm=5.526, loss_scale=1.000, learning_rate=6.855e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 11:14:16,938 (deepspeed_trainer:228) INFO: 35epoch:train:801-900batch: iter_time=1.082e-04, loss_ctc=66.753, loss_att=46.462, acc=0.730, loss=52.559, grad_norm=5.433, loss_scale=1.000, learning_rate=6.854e-05, step_time=0.329 [2024-12-08 11:14:50,488] [INFO] [logging.py:129:log_dist] [Rank 0] step=511000, skipped=0, lr=[np.float64(6.853221139508479e-05)], mom=[[0.9, 0.98]] [2024-12-08 11:14:50,489] [INFO] [timer.py:264:stop] epoch=0/micro_step=121000/global_step=121000, RunningAvgSamplesPerSec=49.24376922037026, CurrSamplesPerSec=48.37779485475309, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 11:14:50,492 (deepspeed_trainer:228) INFO: 35epoch:train:901-1000batch: iter_time=1.098e-04, loss_ctc=84.380, loss_att=60.778, acc=0.708, loss=67.866, grad_norm=6.849, loss_scale=1.000, learning_rate=6.854e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 11:15:23,386 (deepspeed_trainer:228) INFO: 35epoch:train:1001-1100batch: iter_time=1.084e-04, loss_ctc=64.096, loss_att=50.508, acc=0.715, loss=54.599, grad_norm=5.095, loss_scale=1.000, learning_rate=6.853e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 11:15:56,375 (deepspeed_trainer:228) INFO: 35epoch:train:1101-1200batch: iter_time=1.143e-04, loss_ctc=71.319, loss_att=46.864, acc=0.728, loss=54.197, grad_norm=5.987, loss_scale=1.000, learning_rate=6.852e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 11:16:29,563 (deepspeed_trainer:228) INFO: 35epoch:train:1201-1300batch: iter_time=1.133e-04, loss_ctc=70.158, loss_att=49.586, acc=0.720, loss=55.746, grad_norm=6.213, loss_scale=1.000, learning_rate=6.852e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 11:17:02,472 (deepspeed_trainer:228) INFO: 35epoch:train:1301-1400batch: iter_time=1.109e-04, loss_ctc=67.546, loss_att=49.539, acc=0.722, loss=54.910, grad_norm=5.317, loss_scale=1.000, learning_rate=6.851e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 11:17:35,305 (deepspeed_trainer:228) INFO: 35epoch:train:1401-1500batch: iter_time=1.101e-04, loss_ctc=66.446, loss_att=50.380, acc=0.722, loss=55.180, grad_norm=5.881, loss_scale=1.000, learning_rate=6.850e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 11:18:08,316 (deepspeed_trainer:228) INFO: 35epoch:train:1501-1600batch: iter_time=1.172e-04, loss_ctc=68.751, loss_att=53.386, acc=0.722, loss=58.040, grad_norm=5.271, loss_scale=1.000, learning_rate=6.850e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 11:18:41,382 (deepspeed_trainer:228) INFO: 35epoch:train:1601-1700batch: iter_time=1.129e-04, loss_ctc=73.951, loss_att=57.767, acc=0.713, loss=62.617, grad_norm=6.099, loss_scale=1.000, learning_rate=6.849e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 11:19:14,240 (deepspeed_trainer:228) INFO: 35epoch:train:1701-1800batch: iter_time=1.072e-04, loss_ctc=67.495, loss_att=48.918, acc=0.727, loss=54.488, grad_norm=5.628, loss_scale=1.000, learning_rate=6.848e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 11:19:43,153 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-08 11:20:09,297 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 11:20:26,438 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 11:20:26,438 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 11:20:26,441 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 11:20:53,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:20:53,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:20:53,492] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:20:54,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:20:54,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:20:56,211] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:20:55,379] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:20:55,811] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:20:56,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:20:56,928] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:20:57,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:20:57,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:20:57,098] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:20:57,111] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:20:57,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:20:56,728] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:44,291] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:44,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:45,273] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:45,536] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:45,904] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:46,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:46,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:47,021] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:49,123] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:49,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:50,055] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:50,138] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:50,812] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:50,885] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:50,897] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:21:53,141] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:33,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:34,287] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:35,224] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:35,332] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:35,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:36,219] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:36,459] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:38,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:38,980] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:38,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:39,654] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:40,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:40,860] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:43,928] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:44,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:22:45,350] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:21,734] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:22,252] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:22,489] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:22,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:23,721] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:24,174] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:24,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:27,556] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:28,706] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:29,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:29,475] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:29,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:29,459] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:35,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:36,045] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:23:39,853] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 11:24:16,269 (deepspeed_trainer:228) INFO: 35epoch:train:1801-1900batch: iter_time=2.603, loss_ctc=69.793, loss_att=48.774, acc=0.729, loss=55.067, grad_norm=5.714, loss_scale=1.000, learning_rate=6.848e-05, step_time=0.417 [2024-12-08 11:24:49,589] [INFO] [logging.py:129:log_dist] [Rank 0] step=512000, skipped=0, lr=[np.float64(6.846525282757995e-05)], mom=[[0.9, 0.98]] [2024-12-08 11:24:49,590] [INFO] [timer.py:264:stop] epoch=0/micro_step=122000/global_step=122000, RunningAvgSamplesPerSec=49.23896464926982, CurrSamplesPerSec=52.06135283031394, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 11:24:49,593 (deepspeed_trainer:228) INFO: 35epoch:train:1901-2000batch: iter_time=1.223e-04, loss_ctc=70.530, loss_att=53.981, acc=0.728, loss=58.948, grad_norm=5.529, loss_scale=1.000, learning_rate=6.847e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 11:25:23,125 (deepspeed_trainer:228) INFO: 35epoch:train:2001-2100batch: iter_time=1.121e-04, loss_ctc=76.081, loss_att=52.358, acc=0.727, loss=59.500, grad_norm=6.105, loss_scale=1.000, learning_rate=6.846e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 11:25:56,206 (deepspeed_trainer:228) INFO: 35epoch:train:2101-2200batch: iter_time=1.072e-04, loss_ctc=72.638, loss_att=56.070, acc=0.731, loss=61.038, grad_norm=5.997, loss_scale=1.000, learning_rate=6.846e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 11:26:29,379 (deepspeed_trainer:228) INFO: 35epoch:train:2201-2300batch: iter_time=1.057e-04, loss_ctc=73.204, loss_att=52.120, acc=0.719, loss=58.423, grad_norm=5.933, loss_scale=1.000, learning_rate=6.845e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 11:27:02,167 (deepspeed_trainer:228) INFO: 35epoch:train:2301-2400batch: iter_time=1.102e-04, loss_ctc=75.792, loss_att=54.772, acc=0.720, loss=61.091, grad_norm=5.534, loss_scale=1.000, learning_rate=6.844e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 11:27:34,647 (deepspeed_trainer:228) INFO: 35epoch:train:2401-2500batch: iter_time=1.047e-04, loss_ctc=62.397, loss_att=46.971, acc=0.725, loss=51.583, grad_norm=6.088, loss_scale=1.000, learning_rate=6.844e-05, step_time=0.324 [cnode7-012:0/16] 2024-12-08 11:28:07,928 (deepspeed_trainer:228) INFO: 35epoch:train:2501-2600batch: iter_time=1.051e-04, loss_ctc=71.995, loss_att=53.658, acc=0.727, loss=59.166, grad_norm=5.275, loss_scale=1.000, learning_rate=6.843e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 11:28:41,203 (deepspeed_trainer:228) INFO: 35epoch:train:2601-2700batch: iter_time=1.046e-04, loss_ctc=72.934, loss_att=54.816, acc=0.711, loss=60.272, grad_norm=6.114, loss_scale=1.000, learning_rate=6.842e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 11:29:14,493 (deepspeed_trainer:228) INFO: 35epoch:train:2701-2800batch: iter_time=1.068e-04, loss_ctc=69.612, loss_att=50.336, acc=0.732, loss=56.132, grad_norm=6.753, loss_scale=1.000, learning_rate=6.842e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 11:29:48,568 (deepspeed_trainer:228) INFO: 35epoch:train:2801-2900batch: iter_time=1.051e-04, loss_ctc=78.236, loss_att=61.622, acc=0.716, loss=66.601, grad_norm=5.757, loss_scale=1.000, learning_rate=6.841e-05, step_time=0.340 [2024-12-08 11:30:21,714] [INFO] [logging.py:129:log_dist] [Rank 0] step=513000, skipped=0, lr=[np.float64(6.839849014051436e-05)], mom=[[0.9, 0.98]] [2024-12-08 11:30:21,715] [INFO] [timer.py:264:stop] epoch=0/micro_step=123000/global_step=123000, RunningAvgSamplesPerSec=49.24213876152857, CurrSamplesPerSec=48.231208034850795, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 11:30:21,717 (deepspeed_trainer:228) INFO: 35epoch:train:2901-3000batch: iter_time=1.055e-04, loss_ctc=66.749, loss_att=48.190, acc=0.725, loss=53.763, grad_norm=5.356, loss_scale=1.000, learning_rate=6.840e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 11:30:55,035 (deepspeed_trainer:228) INFO: 35epoch:train:3001-3100batch: iter_time=1.056e-04, loss_ctc=70.571, loss_att=49.092, acc=0.724, loss=55.550, grad_norm=5.936, loss_scale=1.000, learning_rate=6.840e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 11:31:27,793 (deepspeed_trainer:228) INFO: 35epoch:train:3101-3200batch: iter_time=1.076e-04, loss_ctc=64.740, loss_att=46.192, acc=0.732, loss=51.777, grad_norm=5.443, loss_scale=1.000, learning_rate=6.839e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 11:32:00,865 (deepspeed_trainer:228) INFO: 35epoch:train:3201-3300batch: iter_time=1.079e-04, loss_ctc=69.080, loss_att=54.842, acc=0.719, loss=59.100, grad_norm=5.590, loss_scale=1.000, learning_rate=6.838e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 11:32:34,045 (deepspeed_trainer:228) INFO: 35epoch:train:3301-3400batch: iter_time=1.043e-04, loss_ctc=66.263, loss_att=47.002, acc=0.729, loss=52.786, grad_norm=6.148, loss_scale=1.000, learning_rate=6.838e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 11:33:07,451 (deepspeed_trainer:228) INFO: 35epoch:train:3401-3500batch: iter_time=1.078e-04, loss_ctc=76.047, loss_att=62.233, acc=0.724, loss=66.381, grad_norm=5.628, loss_scale=1.000, learning_rate=6.837e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 11:33:40,159 (deepspeed_trainer:228) INFO: 35epoch:train:3501-3600batch: iter_time=1.050e-04, loss_ctc=59.604, loss_att=46.217, acc=0.719, loss=50.230, grad_norm=6.023, loss_scale=1.000, learning_rate=6.836e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 11:34:13,016 (deepspeed_trainer:228) INFO: 35epoch:train:3601-3700batch: iter_time=1.079e-04, loss_ctc=68.784, loss_att=49.564, acc=0.736, loss=55.344, grad_norm=5.154, loss_scale=1.000, learning_rate=6.836e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 11:34:33,765 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-08 11:35:00,083 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 11:35:15,716 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 11:35:15,716 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 11:35:15,719 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 11:35:41,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:35:41,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:35:41,974] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:35:42,728] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:35:43,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:35:43,367] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:35:44,565] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:35:44,829] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:35:44,017] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:35:45,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:35:44,419] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:35:45,747] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:35:45,748] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:35:46,193] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:35:46,569] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:35:46,898] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:31,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:31,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:31,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:31,878] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:31,984] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:33,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:32,917] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:34,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:33,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:33,953] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:35,498] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:35,558] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:35,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:36,133] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:36,537] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:36:36,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:18,329] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:18,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:19,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:20,164] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:20,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:20,855] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:21,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:21,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:24,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:24,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:26,591] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:27,036] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:27,566] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:29,540] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:30,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:37:30,805] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:06,386] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:06,936] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:08,034] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:09,482] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:10,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:10,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:11,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:11,583] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:12,902] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:13,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:15,231] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:15,705] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:15,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:20,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:20,860] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:38:22,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 11:39:04,400 (deepspeed_trainer:228) INFO: 35epoch:train:3701-3800batch: iter_time=2.508, loss_ctc=73.377, loss_att=52.862, acc=0.727, loss=59.048, grad_norm=6.579, loss_scale=1.000, learning_rate=6.835e-05, step_time=0.406 [cnode7-012:0/16] 2024-12-08 11:39:37,592 (deepspeed_trainer:228) INFO: 35epoch:train:3801-3900batch: iter_time=1.057e-04, loss_ctc=73.438, loss_att=53.932, acc=0.724, loss=59.798, grad_norm=5.426, loss_scale=1.000, learning_rate=6.834e-05, step_time=0.331 [2024-12-08 11:40:11,842] [INFO] [logging.py:129:log_dist] [Rank 0] step=514000, skipped=0, lr=[np.float64(6.833192238069986e-05)], mom=[[0.9, 0.98]] [2024-12-08 11:40:11,843] [INFO] [timer.py:264:stop] epoch=0/micro_step=124000/global_step=124000, RunningAvgSamplesPerSec=49.23613655059843, CurrSamplesPerSec=50.44113574473205, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 11:40:11,846 (deepspeed_trainer:228) INFO: 35epoch:train:3901-4000batch: iter_time=1.079e-04, loss_ctc=73.738, loss_att=52.340, acc=0.730, loss=58.736, grad_norm=5.964, loss_scale=1.000, learning_rate=6.834e-05, step_time=0.342 [cnode7-012:0/16] 2024-12-08 11:40:44,116 (deepspeed_trainer:228) INFO: 35epoch:train:4001-4100batch: iter_time=1.076e-04, loss_ctc=68.722, loss_att=51.522, acc=0.724, loss=56.692, grad_norm=6.093, loss_scale=1.000, learning_rate=6.833e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 11:41:17,406 (deepspeed_trainer:228) INFO: 35epoch:train:4101-4200batch: iter_time=1.099e-04, loss_ctc=78.576, loss_att=55.110, acc=0.721, loss=62.142, grad_norm=6.300, loss_scale=1.000, learning_rate=6.832e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 11:41:50,031 (deepspeed_trainer:228) INFO: 35epoch:train:4201-4300batch: iter_time=1.085e-04, loss_ctc=69.077, loss_att=50.623, acc=0.707, loss=56.150, grad_norm=5.260, loss_scale=1.000, learning_rate=6.832e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 11:42:23,079 (deepspeed_trainer:228) INFO: 35epoch:train:4301-4400batch: iter_time=1.092e-04, loss_ctc=68.021, loss_att=51.491, acc=0.724, loss=56.401, grad_norm=5.474, loss_scale=1.000, learning_rate=6.831e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 11:42:56,125 (deepspeed_trainer:228) INFO: 35epoch:train:4401-4500batch: iter_time=1.058e-04, loss_ctc=71.957, loss_att=53.960, acc=0.725, loss=59.374, grad_norm=5.191, loss_scale=1.000, learning_rate=6.830e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 11:43:29,048 (deepspeed_trainer:228) INFO: 35epoch:train:4501-4600batch: iter_time=1.088e-04, loss_ctc=67.651, loss_att=47.851, acc=0.718, loss=53.791, grad_norm=5.477, loss_scale=1.000, learning_rate=6.830e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 11:44:01,989 (deepspeed_trainer:228) INFO: 35epoch:train:4601-4700batch: iter_time=1.068e-04, loss_ctc=71.346, loss_att=49.442, acc=0.733, loss=56.006, grad_norm=5.655, loss_scale=1.000, learning_rate=6.829e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 11:44:35,161 (deepspeed_trainer:228) INFO: 35epoch:train:4701-4800batch: iter_time=1.079e-04, loss_ctc=75.743, loss_att=60.662, acc=0.707, loss=65.184, grad_norm=6.414, loss_scale=1.000, learning_rate=6.828e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 11:45:07,915 (deepspeed_trainer:228) INFO: 35epoch:train:4801-4900batch: iter_time=1.082e-04, loss_ctc=67.369, loss_att=46.880, acc=0.730, loss=53.022, grad_norm=5.523, loss_scale=1.000, learning_rate=6.828e-05, step_time=0.327 [2024-12-08 11:45:40,759] [INFO] [logging.py:129:log_dist] [Rank 0] step=515000, skipped=0, lr=[np.float64(6.826554860142945e-05)], mom=[[0.9, 0.98]] [2024-12-08 11:45:40,760] [INFO] [timer.py:264:stop] epoch=0/micro_step=125000/global_step=125000, RunningAvgSamplesPerSec=49.24313535713938, CurrSamplesPerSec=52.31207075116413, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 11:45:40,763 (deepspeed_trainer:228) INFO: 35epoch:train:4901-5000batch: iter_time=1.086e-04, loss_ctc=69.022, loss_att=47.076, acc=0.728, loss=53.649, grad_norm=6.456, loss_scale=1.000, learning_rate=6.827e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 11:46:15,367 (deepspeed_trainer:228) INFO: 35epoch:train:5001-5100batch: iter_time=1.084e-04, loss_ctc=66.390, loss_att=48.965, acc=0.721, loss=54.190, grad_norm=5.492, loss_scale=1.000, learning_rate=6.826e-05, step_time=0.345 [cnode7-012:0/16] 2024-12-08 11:46:48,228 (deepspeed_trainer:228) INFO: 35epoch:train:5101-5200batch: iter_time=1.075e-04, loss_ctc=64.238, loss_att=50.248, acc=0.718, loss=54.447, grad_norm=4.779, loss_scale=1.000, learning_rate=6.826e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 11:47:21,348 (deepspeed_trainer:228) INFO: 35epoch:train:5201-5300batch: iter_time=1.080e-04, loss_ctc=69.685, loss_att=49.879, acc=0.726, loss=55.793, grad_norm=5.710, loss_scale=1.000, learning_rate=6.825e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 11:47:54,520 (deepspeed_trainer:228) INFO: 35epoch:train:5301-5400batch: iter_time=1.074e-04, loss_ctc=77.788, loss_att=61.156, acc=0.723, loss=66.154, grad_norm=6.067, loss_scale=1.000, learning_rate=6.824e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 11:48:27,498 (deepspeed_trainer:228) INFO: 35epoch:train:5401-5500batch: iter_time=1.080e-04, loss_ctc=62.097, loss_att=46.760, acc=0.723, loss=51.374, grad_norm=5.396, loss_scale=1.000, learning_rate=6.824e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 11:49:00,138 (deepspeed_trainer:228) INFO: 35epoch:train:5501-5600batch: iter_time=1.097e-04, loss_ctc=69.401, loss_att=48.699, acc=0.733, loss=54.928, grad_norm=5.446, loss_scale=1.000, learning_rate=6.823e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 11:49:12,305 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-08 11:49:40,107 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 11:49:57,799 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 11:49:57,799 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 11:49:57,801 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 11:50:20,480] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:50:20,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:50:21,337] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:50:22,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:50:22,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:50:22,831] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:50:23,566] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:50:23,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:50:23,130] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:50:23,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:50:24,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:50:24,469] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:50:24,720] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:50:25,165] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:50:25,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:50:26,463] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:10,147] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:10,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:10,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:11,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:11,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:12,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:12,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:13,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:14,536] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:15,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:15,512] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:16,309] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:16,565] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:17,017] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:17,485] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:19,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:58,719] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:58,749] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:51:59,790] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:01,106] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:01,537] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:02,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:02,039] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:02,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:04,464] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:05,387] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:05,881] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:06,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:06,391] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:07,760] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:09,414] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:11,212] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:47,804] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:47,856] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:48,313] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:49,770] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:50,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:51,366] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:50,944] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:51,391] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:54,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:54,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:55,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:55,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:52:57,087] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:53:00,896] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:53:02,058] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 11:53:02,288] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 11:53:54,372 (deepspeed_trainer:228) INFO: 35epoch:train:5601-5700batch: iter_time=2.556, loss_ctc=69.593, loss_att=53.795, acc=0.725, loss=58.540, grad_norm=5.948, loss_scale=1.000, learning_rate=6.822e-05, step_time=0.386 [cnode7-012:0/16] 2024-12-08 11:54:28,238 (deepspeed_trainer:228) INFO: 35epoch:train:5701-5800batch: iter_time=1.249e-04, loss_ctc=76.772, loss_att=53.537, acc=0.734, loss=60.506, grad_norm=6.019, loss_scale=1.000, learning_rate=6.822e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 11:55:01,815 (deepspeed_trainer:228) INFO: 35epoch:train:5801-5900batch: iter_time=1.174e-04, loss_ctc=75.826, loss_att=56.597, acc=0.728, loss=62.352, grad_norm=5.705, loss_scale=1.000, learning_rate=6.821e-05, step_time=0.335 [2024-12-08 11:55:35,091] [INFO] [logging.py:129:log_dist] [Rank 0] step=516000, skipped=0, lr=[np.float64(6.81993678624206e-05)], mom=[[0.9, 0.98]] [2024-12-08 11:55:35,091] [INFO] [timer.py:264:stop] epoch=0/micro_step=126000/global_step=126000, RunningAvgSamplesPerSec=49.23872182811239, CurrSamplesPerSec=51.73298351860783, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 11:55:35,094 (deepspeed_trainer:228) INFO: 35epoch:train:5901-6000batch: iter_time=1.148e-04, loss_ctc=62.129, loss_att=45.558, acc=0.734, loss=50.512, grad_norm=5.101, loss_scale=1.000, learning_rate=6.820e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 11:56:08,771 (deepspeed_trainer:228) INFO: 35epoch:train:6001-6100batch: iter_time=1.111e-04, loss_ctc=83.746, loss_att=61.244, acc=0.711, loss=68.008, grad_norm=6.404, loss_scale=1.000, learning_rate=6.820e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 11:56:41,840 (deepspeed_trainer:228) INFO: 35epoch:train:6101-6200batch: iter_time=1.131e-04, loss_ctc=64.994, loss_att=45.872, acc=0.732, loss=51.615, grad_norm=5.089, loss_scale=1.000, learning_rate=6.819e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 11:57:15,224 (deepspeed_trainer:228) INFO: 35epoch:train:6201-6300batch: iter_time=1.113e-04, loss_ctc=71.306, loss_att=56.475, acc=0.720, loss=60.923, grad_norm=5.389, loss_scale=1.000, learning_rate=6.818e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 11:57:48,334 (deepspeed_trainer:228) INFO: 35epoch:train:6301-6400batch: iter_time=1.129e-04, loss_ctc=69.893, loss_att=53.349, acc=0.723, loss=58.300, grad_norm=5.311, loss_scale=1.000, learning_rate=6.818e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 11:58:21,267 (deepspeed_trainer:228) INFO: 35epoch:train:6401-6500batch: iter_time=1.173e-04, loss_ctc=66.800, loss_att=49.177, acc=0.726, loss=54.457, grad_norm=5.591, loss_scale=1.000, learning_rate=6.817e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 11:58:54,476 (deepspeed_trainer:228) INFO: 35epoch:train:6501-6600batch: iter_time=1.130e-04, loss_ctc=75.058, loss_att=55.029, acc=0.726, loss=61.061, grad_norm=5.888, loss_scale=1.000, learning_rate=6.816e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 11:59:27,717 (deepspeed_trainer:228) INFO: 35epoch:train:6601-6700batch: iter_time=1.208e-04, loss_ctc=72.010, loss_att=57.475, acc=0.721, loss=61.845, grad_norm=5.742, loss_scale=1.000, learning_rate=6.816e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 11:59:59,792 (deepspeed_trainer:228) INFO: 35epoch:train:6701-6800batch: iter_time=1.188e-04, loss_ctc=62.322, loss_att=41.377, acc=0.736, loss=47.656, grad_norm=5.100, loss_scale=1.000, learning_rate=6.815e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 12:00:32,855 (deepspeed_trainer:228) INFO: 35epoch:train:6801-6900batch: iter_time=1.185e-04, loss_ctc=72.080, loss_att=50.283, acc=0.726, loss=56.823, grad_norm=5.471, loss_scale=1.000, learning_rate=6.814e-05, step_time=0.330 [2024-12-08 12:01:06,036] [INFO] [logging.py:129:log_dist] [Rank 0] step=517000, skipped=0, lr=[np.float64(6.813337922975947e-05)], mom=[[0.9, 0.98]] [2024-12-08 12:01:06,037] [INFO] [timer.py:264:stop] epoch=0/micro_step=127000/global_step=127000, RunningAvgSamplesPerSec=49.24349469569633, CurrSamplesPerSec=46.007415787425174, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 12:01:06,039 (deepspeed_trainer:228) INFO: 35epoch:train:6901-7000batch: iter_time=1.141e-04, loss_ctc=68.086, loss_att=51.588, acc=0.729, loss=56.541, grad_norm=5.345, loss_scale=1.000, learning_rate=6.814e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 12:01:39,097 (deepspeed_trainer:228) INFO: 35epoch:train:7001-7100batch: iter_time=1.087e-04, loss_ctc=66.635, loss_att=51.932, acc=0.726, loss=56.337, grad_norm=4.940, loss_scale=1.000, learning_rate=6.813e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 12:02:11,968 (deepspeed_trainer:228) INFO: 35epoch:train:7101-7200batch: iter_time=1.066e-04, loss_ctc=66.377, loss_att=48.402, acc=0.737, loss=53.795, grad_norm=5.556, loss_scale=1.000, learning_rate=6.812e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 12:02:44,919 (deepspeed_trainer:228) INFO: 35epoch:train:7201-7300batch: iter_time=1.084e-04, loss_ctc=71.218, loss_att=58.748, acc=0.718, loss=62.498, grad_norm=6.101, loss_scale=1.000, learning_rate=6.812e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 12:03:17,597 (deepspeed_trainer:228) INFO: 35epoch:train:7301-7400batch: iter_time=1.129e-04, loss_ctc=62.897, loss_att=48.815, acc=0.723, loss=53.041, grad_norm=5.492, loss_scale=1.000, learning_rate=6.811e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 12:03:50,498 (deepspeed_trainer:228) INFO: 35epoch:train:7401-7500batch: iter_time=1.128e-04, loss_ctc=71.457, loss_att=50.358, acc=0.738, loss=56.693, grad_norm=5.114, loss_scale=1.000, learning_rate=6.810e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 12:03:54,320 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-08 12:04:21,691 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 12:04:38,132 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 12:04:38,132 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 12:04:38,135 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 12:05:03,793] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:04,884] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:04,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:05,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:05,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:06,034] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:06,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:06,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:10,261] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:10,320] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:10,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:10,475] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:10,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:10,583] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:10,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:10,665] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:52,869] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:53,919] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:54,382] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:55,215] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:55,861] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:56,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:57,330] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:57,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:59,626] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:05:59,935] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:00,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:00,711] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:01,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:01,838] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:01,951] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:01,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:41,761] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:42,324] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:43,404] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:43,799] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:45,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:45,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:45,600] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:46,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:51,761] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:51,824] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:51,865] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:52,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:52,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:53,235] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:53,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:06:56,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:30,563] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:32,115] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:32,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:32,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:33,824] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:34,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:35,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:35,564] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:42,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:42,971] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:43,330] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:43,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:43,992] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:45,369] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:46,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:07:50,661] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 12:08:50,900 (deepspeed_trainer:228) INFO: 35epoch:train:7501-7600batch: iter_time=2.617, loss_ctc=68.049, loss_att=51.339, acc=0.721, loss=56.351, grad_norm=5.087, loss_scale=1.000, learning_rate=6.810e-05, step_time=0.386 [cnode7-012:0/16] 2024-12-08 12:09:24,111 (deepspeed_trainer:228) INFO: 35epoch:train:7601-7700batch: iter_time=1.080e-04, loss_ctc=77.343, loss_att=55.693, acc=0.732, loss=62.177, grad_norm=5.345, loss_scale=1.000, learning_rate=6.809e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 12:09:57,686 (deepspeed_trainer:228) INFO: 35epoch:train:7701-7800batch: iter_time=1.135e-04, loss_ctc=72.647, loss_att=53.399, acc=0.728, loss=59.182, grad_norm=5.337, loss_scale=1.000, learning_rate=6.808e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 12:10:30,817 (deepspeed_trainer:228) INFO: 35epoch:train:7801-7900batch: iter_time=1.163e-04, loss_ctc=66.224, loss_att=47.601, acc=0.727, loss=53.190, grad_norm=5.115, loss_scale=1.000, learning_rate=6.808e-05, step_time=0.331 [2024-12-08 12:11:03,935] [INFO] [logging.py:129:log_dist] [Rank 0] step=518000, skipped=0, lr=[np.float64(6.806758177584547e-05)], mom=[[0.9, 0.98]] [2024-12-08 12:11:03,935] [INFO] [timer.py:264:stop] epoch=0/micro_step=128000/global_step=128000, RunningAvgSamplesPerSec=49.24209026017414, CurrSamplesPerSec=50.69793317911067, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 12:11:03,938 (deepspeed_trainer:228) INFO: 35epoch:train:7901-8000batch: iter_time=1.119e-04, loss_ctc=81.791, loss_att=60.006, acc=0.705, loss=66.545, grad_norm=6.057, loss_scale=1.000, learning_rate=6.807e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 12:11:36,814 (deepspeed_trainer:228) INFO: 35epoch:train:8001-8100batch: iter_time=1.128e-04, loss_ctc=62.738, loss_att=45.886, acc=0.727, loss=50.915, grad_norm=4.899, loss_scale=1.000, learning_rate=6.806e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 12:12:09,890 (deepspeed_trainer:228) INFO: 35epoch:train:8101-8200batch: iter_time=1.140e-04, loss_ctc=70.889, loss_att=53.945, acc=0.725, loss=59.007, grad_norm=5.203, loss_scale=1.000, learning_rate=6.806e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 12:12:43,459 (deepspeed_trainer:228) INFO: 35epoch:train:8201-8300batch: iter_time=1.145e-04, loss_ctc=72.375, loss_att=53.773, acc=0.711, loss=59.367, grad_norm=5.784, loss_scale=1.000, learning_rate=6.805e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 12:13:16,723 (deepspeed_trainer:228) INFO: 35epoch:train:8301-8400batch: iter_time=1.115e-04, loss_ctc=65.386, loss_att=45.157, acc=0.736, loss=51.208, grad_norm=5.183, loss_scale=1.000, learning_rate=6.804e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 12:13:49,925 (deepspeed_trainer:228) INFO: 35epoch:train:8401-8500batch: iter_time=1.108e-04, loss_ctc=79.375, loss_att=60.133, acc=0.711, loss=65.926, grad_norm=6.333, loss_scale=1.000, learning_rate=6.804e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 12:14:22,982 (deepspeed_trainer:228) INFO: 35epoch:train:8501-8600batch: iter_time=1.118e-04, loss_ctc=62.951, loss_att=50.023, acc=0.718, loss=53.894, grad_norm=5.105, loss_scale=1.000, learning_rate=6.803e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 12:14:56,060 (deepspeed_trainer:228) INFO: 35epoch:train:8601-8700batch: iter_time=1.131e-04, loss_ctc=70.036, loss_att=46.254, acc=0.733, loss=53.403, grad_norm=5.948, loss_scale=1.000, learning_rate=6.802e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 12:15:29,082 (deepspeed_trainer:228) INFO: 35epoch:train:8701-8800batch: iter_time=1.125e-04, loss_ctc=68.660, loss_att=49.597, acc=0.720, loss=55.330, grad_norm=5.711, loss_scale=1.000, learning_rate=6.802e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 12:16:02,533 (deepspeed_trainer:228) INFO: 35epoch:train:8801-8900batch: iter_time=1.124e-04, loss_ctc=66.496, loss_att=48.180, acc=0.728, loss=53.687, grad_norm=4.858, loss_scale=1.000, learning_rate=6.801e-05, step_time=0.334 [2024-12-08 12:16:35,387] [INFO] [logging.py:129:log_dist] [Rank 0] step=519000, skipped=0, lr=[np.float64(6.800197457933655e-05)], mom=[[0.9, 0.98]] [2024-12-08 12:16:35,388] [INFO] [timer.py:264:stop] epoch=0/micro_step=129000/global_step=129000, RunningAvgSamplesPerSec=49.24665517960556, CurrSamplesPerSec=50.93299370101617, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 12:16:35,390 (deepspeed_trainer:228) INFO: 35epoch:train:8901-9000batch: iter_time=1.126e-04, loss_ctc=65.093, loss_att=49.702, acc=0.725, loss=54.342, grad_norm=5.191, loss_scale=1.000, learning_rate=6.801e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 12:17:08,282 (deepspeed_trainer:228) INFO: 35epoch:train:9001-9100batch: iter_time=1.120e-04, loss_ctc=68.329, loss_att=52.312, acc=0.729, loss=57.093, grad_norm=5.460, loss_scale=1.000, learning_rate=6.800e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 12:17:41,325 (deepspeed_trainer:228) INFO: 35epoch:train:9101-9200batch: iter_time=1.127e-04, loss_ctc=70.947, loss_att=56.018, acc=0.721, loss=60.525, grad_norm=6.320, loss_scale=1.000, learning_rate=6.799e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 12:18:14,170 (deepspeed_trainer:228) INFO: 35epoch:train:9201-9300batch: iter_time=1.117e-04, loss_ctc=66.084, loss_att=48.043, acc=0.731, loss=53.436, grad_norm=5.166, loss_scale=1.000, learning_rate=6.799e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 12:18:43,465 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-08 12:19:10,780 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 12:19:27,690 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 12:19:27,690 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 12:19:27,693 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 12:19:52,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:19:52,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:19:53,157] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:19:53,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:19:54,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:19:55,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:19:55,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:19:55,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:19:55,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:19:56,064] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:19:56,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:19:56,680] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:19:57,045] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:19:57,271] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:19:57,502] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:19:58,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:41,109] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:41,424] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:41,786] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:42,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:44,128] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:44,793] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:45,234] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:45,632] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:45,721] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:46,069] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:46,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:48,502] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:48,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:49,108] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:49,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:20:50,637] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:30,829] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:30,957] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:31,419] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:31,643] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:33,411] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:34,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:34,401] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:34,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:34,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:35,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:36,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:36,591] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:37,101] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:37,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:38,759] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:21:42,290] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:18,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:19,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:19,473] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:21,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:22,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:23,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:23,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:23,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:23,857] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:23,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:25,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:26,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:26,538] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:27,049] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:27,489] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:22:33,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 12:23:08,202 (deepspeed_trainer:228) INFO: 35epoch:train:9301-9400batch: iter_time=2.540, loss_ctc=68.464, loss_att=48.118, acc=0.733, loss=54.227, grad_norm=5.057, loss_scale=1.000, learning_rate=6.798e-05, step_time=0.394 [cnode7-012:0/16] 2024-12-08 12:23:41,721 (deepspeed_trainer:228) INFO: 35epoch:train:9401-9500batch: iter_time=1.072e-04, loss_ctc=69.241, loss_att=52.598, acc=0.732, loss=57.600, grad_norm=4.777, loss_scale=1.000, learning_rate=6.797e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 12:24:15,653 (deepspeed_trainer:228) INFO: 35epoch:train:9501-9600batch: iter_time=1.142e-04, loss_ctc=75.304, loss_att=52.352, acc=0.732, loss=59.250, grad_norm=6.565, loss_scale=1.000, learning_rate=6.797e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 12:24:48,916 (deepspeed_trainer:228) INFO: 35epoch:train:9601-9700batch: iter_time=1.113e-04, loss_ctc=72.543, loss_att=56.145, acc=0.733, loss=61.074, grad_norm=6.135, loss_scale=1.000, learning_rate=6.796e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 12:25:22,268 (deepspeed_trainer:228) INFO: 35epoch:train:9701-9800batch: iter_time=1.086e-04, loss_ctc=72.424, loss_att=51.366, acc=0.722, loss=57.706, grad_norm=5.268, loss_scale=1.000, learning_rate=6.795e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 12:25:56,057 (deepspeed_trainer:228) INFO: 35epoch:train:9801-9900batch: iter_time=1.082e-04, loss_ctc=75.379, loss_att=54.905, acc=0.722, loss=61.074, grad_norm=5.794, loss_scale=1.000, learning_rate=6.795e-05, step_time=0.338 [2024-12-08 12:26:29,574] [INFO] [logging.py:129:log_dist] [Rank 0] step=520000, skipped=0, lr=[np.float64(6.793655672509492e-05)], mom=[[0.9, 0.98]] [2024-12-08 12:26:29,575] [INFO] [timer.py:264:stop] epoch=0/micro_step=130000/global_step=130000, RunningAvgSamplesPerSec=49.24131508930316, CurrSamplesPerSec=45.3512366130633, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 12:26:29,577 (deepspeed_trainer:228) INFO: 35epoch:train:9901-10000batch: iter_time=1.090e-04, loss_ctc=61.619, loss_att=46.124, acc=0.729, loss=50.760, grad_norm=4.740, loss_scale=1.000, learning_rate=6.794e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 12:27:04,172 (deepspeed_trainer:228) INFO: 35epoch:train:10001-10100batch: iter_time=1.137e-04, loss_ctc=70.621, loss_att=53.786, acc=0.730, loss=58.831, grad_norm=5.340, loss_scale=1.000, learning_rate=6.793e-05, step_time=0.346 [cnode7-012:0/16] 2024-12-08 12:27:37,505 (deepspeed_trainer:228) INFO: 35epoch:train:10101-10200batch: iter_time=1.120e-04, loss_ctc=71.992, loss_att=53.916, acc=0.715, loss=59.324, grad_norm=5.802, loss_scale=1.000, learning_rate=6.793e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 12:28:10,939 (deepspeed_trainer:228) INFO: 35epoch:train:10201-10300batch: iter_time=1.111e-04, loss_ctc=68.557, loss_att=49.484, acc=0.734, loss=55.196, grad_norm=5.301, loss_scale=1.000, learning_rate=6.792e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 12:28:44,794 (deepspeed_trainer:228) INFO: 35epoch:train:10301-10400batch: iter_time=1.093e-04, loss_ctc=76.556, loss_att=61.018, acc=0.718, loss=65.656, grad_norm=5.715, loss_scale=1.000, learning_rate=6.791e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 12:29:18,279 (deepspeed_trainer:228) INFO: 35epoch:train:10401-10500batch: iter_time=1.117e-04, loss_ctc=65.516, loss_att=47.280, acc=0.729, loss=52.740, grad_norm=5.507, loss_scale=1.000, learning_rate=6.791e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 12:29:52,057 (deepspeed_trainer:228) INFO: 35epoch:train:10501-10600batch: iter_time=1.108e-04, loss_ctc=70.361, loss_att=49.068, acc=0.726, loss=55.463, grad_norm=5.830, loss_scale=1.000, learning_rate=6.790e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 12:30:25,259 (deepspeed_trainer:228) INFO: 35epoch:train:10601-10700batch: iter_time=1.088e-04, loss_ctc=64.126, loss_att=45.871, acc=0.735, loss=51.355, grad_norm=5.053, loss_scale=1.000, learning_rate=6.789e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 12:30:58,740 (deepspeed_trainer:228) INFO: 35epoch:train:10701-10800batch: iter_time=1.093e-04, loss_ctc=68.102, loss_att=53.894, acc=0.722, loss=58.154, grad_norm=5.044, loss_scale=1.000, learning_rate=6.789e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 12:31:32,015 (deepspeed_trainer:228) INFO: 35epoch:train:10801-10900batch: iter_time=1.121e-04, loss_ctc=65.668, loss_att=46.947, acc=0.732, loss=52.539, grad_norm=5.722, loss_scale=1.000, learning_rate=6.788e-05, step_time=0.332 [2024-12-08 12:32:05,423] [INFO] [logging.py:129:log_dist] [Rank 0] step=521000, skipped=0, lr=[np.float64(6.78713273041335e-05)], mom=[[0.9, 0.98]] [2024-12-08 12:32:05,424] [INFO] [timer.py:264:stop] epoch=0/micro_step=131000/global_step=131000, RunningAvgSamplesPerSec=49.240561976785784, CurrSamplesPerSec=52.43649365014513, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 12:32:05,426 (deepspeed_trainer:228) INFO: 35epoch:train:10901-11000batch: iter_time=1.099e-04, loss_ctc=75.529, loss_att=62.016, acc=0.726, loss=66.077, grad_norm=6.300, loss_scale=1.000, learning_rate=6.787e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 12:32:38,242 (deepspeed_trainer:228) INFO: 35epoch:train:11001-11100batch: iter_time=1.104e-04, loss_ctc=58.405, loss_att=45.767, acc=0.723, loss=49.559, grad_norm=5.732, loss_scale=1.000, learning_rate=6.787e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 12:33:11,310 (deepspeed_trainer:228) INFO: 35epoch:train:11101-11200batch: iter_time=1.104e-04, loss_ctc=69.396, loss_att=49.577, acc=0.737, loss=55.534, grad_norm=5.489, loss_scale=1.000, learning_rate=6.786e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 12:33:31,609 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-08 12:33:58,314 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 12:34:15,757 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 12:34:15,757 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 12:34:15,759 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 12:34:41,186] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:34:42,366] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:34:42,411] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:34:42,890] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:34:43,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:34:43,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:34:43,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:34:43,517] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:34:44,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:34:44,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:34:44,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:34:44,255] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:34:44,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:34:45,006] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:34:46,541] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:34:46,598] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:31,091] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:32,387] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:32,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:32,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:32,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:33,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:33,381] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:33,462] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:33,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:33,557] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:33,874] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:34,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:34,387] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:35,229] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:35,564] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:35:36,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:21,123] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:22,453] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:22,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:22,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:23,080] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:23,066] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:23,512] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:23,665] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:23,804] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:24,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:24,624] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:25,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:25,299] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:25,846] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:26,214] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:36:26,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:10,340] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:10,801] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:11,383] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:11,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:12,085] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:12,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:12,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:12,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:13,840] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:15,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:15,550] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:15,640] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:16,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:16,212] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:16,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:37:18,757] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 12:38:02,494 (deepspeed_trainer:228) INFO: 35epoch:train:11201-11300batch: iter_time=2.556, loss_ctc=72.692, loss_att=51.650, acc=0.735, loss=57.977, grad_norm=5.786, loss_scale=1.000, learning_rate=6.786e-05, step_time=0.355 [cnode7-012:0/16] 2024-12-08 12:38:35,772 (deepspeed_trainer:228) INFO: 35epoch:train:11301-11400batch: iter_time=1.140e-04, loss_ctc=72.520, loss_att=53.102, acc=0.731, loss=58.960, grad_norm=5.374, loss_scale=1.000, learning_rate=6.785e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 12:39:09,132 (deepspeed_trainer:228) INFO: 35epoch:train:11401-11500batch: iter_time=1.147e-04, loss_ctc=73.172, loss_att=52.585, acc=0.733, loss=58.768, grad_norm=6.093, loss_scale=1.000, learning_rate=6.784e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 12:39:42,195 (deepspeed_trainer:228) INFO: 35epoch:train:11501-11600batch: iter_time=1.157e-04, loss_ctc=68.582, loss_att=51.613, acc=0.729, loss=56.702, grad_norm=5.676, loss_scale=1.000, learning_rate=6.784e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 12:40:15,501 (deepspeed_trainer:228) INFO: 35epoch:train:11601-11700batch: iter_time=1.157e-04, loss_ctc=78.216, loss_att=55.515, acc=0.727, loss=62.329, grad_norm=5.832, loss_scale=1.000, learning_rate=6.783e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 12:40:48,461 (deepspeed_trainer:228) INFO: 35epoch:train:11701-11800batch: iter_time=1.139e-04, loss_ctc=68.009, loss_att=50.377, acc=0.718, loss=55.648, grad_norm=6.326, loss_scale=1.000, learning_rate=6.782e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 12:41:21,564 (deepspeed_trainer:228) INFO: 35epoch:train:11801-11900batch: iter_time=1.157e-04, loss_ctc=66.503, loss_att=51.784, acc=0.727, loss=56.213, grad_norm=5.078, loss_scale=1.000, learning_rate=6.782e-05, step_time=0.330 [2024-12-08 12:41:55,056] [INFO] [logging.py:129:log_dist] [Rank 0] step=522000, skipped=0, lr=[np.float64(6.78062854135629e-05)], mom=[[0.9, 0.98]] [2024-12-08 12:41:55,056] [INFO] [timer.py:264:stop] epoch=0/micro_step=132000/global_step=132000, RunningAvgSamplesPerSec=49.241381261047636, CurrSamplesPerSec=50.407415601731444, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 12:41:55,059 (deepspeed_trainer:228) INFO: 35epoch:train:11901-12000batch: iter_time=1.139e-04, loss_ctc=71.451, loss_att=53.829, acc=0.730, loss=59.126, grad_norm=5.292, loss_scale=1.000, learning_rate=6.781e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 12:42:28,323 (deepspeed_trainer:228) INFO: 35epoch:train:12001-12100batch: iter_time=1.159e-04, loss_ctc=67.716, loss_att=48.601, acc=0.721, loss=54.346, grad_norm=5.741, loss_scale=1.000, learning_rate=6.780e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 12:43:01,446 (deepspeed_trainer:228) INFO: 35epoch:train:12101-12200batch: iter_time=1.136e-04, loss_ctc=71.292, loss_att=50.998, acc=0.734, loss=57.108, grad_norm=6.088, loss_scale=1.000, learning_rate=6.780e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 12:43:35,059 (deepspeed_trainer:228) INFO: 35epoch:train:12201-12300batch: iter_time=1.075e-04, loss_ctc=74.726, loss_att=60.215, acc=0.716, loss=64.544, grad_norm=6.053, loss_scale=1.000, learning_rate=6.779e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 12:44:08,001 (deepspeed_trainer:228) INFO: 35epoch:train:12301-12400batch: iter_time=1.063e-04, loss_ctc=66.445, loss_att=46.812, acc=0.733, loss=52.663, grad_norm=5.543, loss_scale=1.000, learning_rate=6.778e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 12:44:41,238 (deepspeed_trainer:228) INFO: 35epoch:train:12401-12500batch: iter_time=1.089e-04, loss_ctc=68.066, loss_att=48.300, acc=0.729, loss=54.226, grad_norm=5.257, loss_scale=1.000, learning_rate=6.778e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 12:45:14,382 (deepspeed_trainer:228) INFO: 35epoch:train:12501-12600batch: iter_time=1.073e-04, loss_ctc=65.197, loss_att=48.532, acc=0.729, loss=53.546, grad_norm=5.494, loss_scale=1.000, learning_rate=6.777e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 12:45:47,611 (deepspeed_trainer:228) INFO: 35epoch:train:12601-12700batch: iter_time=1.103e-04, loss_ctc=64.576, loss_att=50.273, acc=0.726, loss=54.577, grad_norm=4.838, loss_scale=1.000, learning_rate=6.776e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 12:46:20,944 (deepspeed_trainer:228) INFO: 35epoch:train:12701-12800batch: iter_time=1.100e-04, loss_ctc=69.094, loss_att=48.458, acc=0.735, loss=54.656, grad_norm=6.414, loss_scale=1.000, learning_rate=6.776e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 12:46:54,358 (deepspeed_trainer:228) INFO: 35epoch:train:12801-12900batch: iter_time=1.107e-04, loss_ctc=76.983, loss_att=61.880, acc=0.728, loss=66.421, grad_norm=6.405, loss_scale=1.000, learning_rate=6.775e-05, step_time=0.334 [2024-12-08 12:47:27,432] [INFO] [logging.py:129:log_dist] [Rank 0] step=523000, skipped=0, lr=[np.float64(6.774143015653886e-05)], mom=[[0.9, 0.98]] [2024-12-08 12:47:27,432] [INFO] [timer.py:264:stop] epoch=0/micro_step=133000/global_step=133000, RunningAvgSamplesPerSec=49.244090073319036, CurrSamplesPerSec=47.03669700209123, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 12:47:27,435 (deepspeed_trainer:228) INFO: 35epoch:train:12901-13000batch: iter_time=1.127e-04, loss_ctc=61.388, loss_att=47.002, acc=0.726, loss=51.294, grad_norm=6.391, loss_scale=1.000, learning_rate=6.774e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 12:48:00,378 (deepspeed_trainer:228) INFO: 35epoch:train:13001-13100batch: iter_time=1.068e-04, loss_ctc=68.693, loss_att=48.598, acc=0.737, loss=54.631, grad_norm=5.207, loss_scale=1.000, learning_rate=6.774e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 12:48:12,473 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-08 12:48:41,047 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 12:48:57,426 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 12:48:57,426 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 12:48:57,429 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 12:49:22,845] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:49:23,337] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:49:23,719] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:49:23,893] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:49:24,667] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:49:24,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:49:24,886] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:49:24,929] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:49:25,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:49:25,200] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:49:25,789] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:49:26,248] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:49:26,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:49:26,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:49:26,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:49:27,063] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:12,177] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:12,594] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:13,216] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:13,252] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:13,781] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:13,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:13,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:14,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:14,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:14,309] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:14,511] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:15,006] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:16,313] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:18,394] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:19,708] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:50:20,027] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:01,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:02,716] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:03,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:03,446] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:03,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:04,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:04,472] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:04,356] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:04,664] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:04,573] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:05,253] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:05,686] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:07,468] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:10,590] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:13,451] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:13,493] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:50,780] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:51,143] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:52,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:52,089] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:53,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:53,667] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:53,794] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:54,843] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:54,641] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:54,874] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:56,192] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:56,005] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:58,265] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:51:58,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:52:03,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 12:52:04,101] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 12:52:54,244 (deepspeed_trainer:228) INFO: 35epoch:train:13101-13200batch: iter_time=2.517, loss_ctc=68.659, loss_att=54.405, acc=0.721, loss=58.682, grad_norm=5.318, loss_scale=1.000, learning_rate=6.773e-05, step_time=0.422 [cnode7-012:0/16] 2024-12-08 12:53:27,361 (deepspeed_trainer:228) INFO: 35epoch:train:13201-13300batch: iter_time=1.058e-04, loss_ctc=76.072, loss_att=53.435, acc=0.731, loss=60.221, grad_norm=6.016, loss_scale=1.000, learning_rate=6.773e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 12:54:00,628 (deepspeed_trainer:228) INFO: 35epoch:train:13301-13400batch: iter_time=1.103e-04, loss_ctc=75.203, loss_att=56.248, acc=0.728, loss=61.958, grad_norm=5.425, loss_scale=1.000, learning_rate=6.772e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 12:54:34,153 (deepspeed_trainer:228) INFO: 35epoch:train:13401-13500batch: iter_time=1.116e-04, loss_ctc=61.211, loss_att=44.900, acc=0.734, loss=49.798, grad_norm=5.086, loss_scale=1.000, learning_rate=6.771e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 12:55:07,259 (deepspeed_trainer:228) INFO: 35epoch:train:13501-13600batch: iter_time=1.096e-04, loss_ctc=82.636, loss_att=60.679, acc=0.707, loss=67.261, grad_norm=6.198, loss_scale=1.000, learning_rate=6.771e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 12:55:40,468 (deepspeed_trainer:228) INFO: 35epoch:train:13601-13700batch: iter_time=1.107e-04, loss_ctc=64.515, loss_att=45.267, acc=0.728, loss=51.046, grad_norm=5.437, loss_scale=1.000, learning_rate=6.770e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 12:56:13,381 (deepspeed_trainer:228) INFO: 35epoch:train:13701-13800batch: iter_time=1.144e-04, loss_ctc=70.155, loss_att=55.398, acc=0.719, loss=59.819, grad_norm=5.044, loss_scale=1.000, learning_rate=6.769e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 12:56:46,192 (deepspeed_trainer:228) INFO: 35epoch:train:13801-13900batch: iter_time=1.107e-04, loss_ctc=69.208, loss_att=53.137, acc=0.721, loss=57.941, grad_norm=5.244, loss_scale=1.000, learning_rate=6.769e-05, step_time=0.328 [2024-12-08 12:57:18,884] [INFO] [logging.py:129:log_dist] [Rank 0] step=524000, skipped=0, lr=[np.float64(6.767676064221036e-05)], mom=[[0.9, 0.98]] [2024-12-08 12:57:18,885] [INFO] [timer.py:264:stop] epoch=0/micro_step=134000/global_step=134000, RunningAvgSamplesPerSec=49.23882548494649, CurrSamplesPerSec=51.523784946692196, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 12:57:18,887 (deepspeed_trainer:228) INFO: 35epoch:train:13901-14000batch: iter_time=1.110e-04, loss_ctc=66.194, loss_att=45.928, acc=0.730, loss=52.029, grad_norm=5.472, loss_scale=1.000, learning_rate=6.768e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 12:57:51,793 (deepspeed_trainer:228) INFO: 35epoch:train:14001-14100batch: iter_time=1.098e-04, loss_ctc=75.495, loss_att=55.303, acc=0.719, loss=61.369, grad_norm=5.971, loss_scale=1.000, learning_rate=6.767e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 12:58:24,838 (deepspeed_trainer:228) INFO: 35epoch:train:14101-14200batch: iter_time=1.207e-04, loss_ctc=71.088, loss_att=57.013, acc=0.717, loss=61.208, grad_norm=5.556, loss_scale=1.000, learning_rate=6.767e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 12:58:57,222 (deepspeed_trainer:228) INFO: 35epoch:train:14201-14300batch: iter_time=1.091e-04, loss_ctc=62.834, loss_att=41.796, acc=0.733, loss=48.105, grad_norm=5.461, loss_scale=1.000, learning_rate=6.766e-05, step_time=0.324 [cnode7-012:0/16] 2024-12-08 12:59:29,889 (deepspeed_trainer:228) INFO: 35epoch:train:14301-14400batch: iter_time=1.105e-04, loss_ctc=71.602, loss_att=48.407, acc=0.729, loss=55.374, grad_norm=6.131, loss_scale=1.000, learning_rate=6.765e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 13:00:02,519 (deepspeed_trainer:228) INFO: 35epoch:train:14401-14500batch: iter_time=1.096e-04, loss_ctc=67.316, loss_att=50.955, acc=0.724, loss=55.872, grad_norm=5.017, loss_scale=1.000, learning_rate=6.765e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 13:00:35,150 (deepspeed_trainer:228) INFO: 35epoch:train:14501-14600batch: iter_time=1.107e-04, loss_ctc=65.980, loss_att=51.911, acc=0.719, loss=56.112, grad_norm=5.270, loss_scale=1.000, learning_rate=6.764e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 13:01:07,911 (deepspeed_trainer:228) INFO: 35epoch:train:14601-14700batch: iter_time=1.098e-04, loss_ctc=66.441, loss_att=47.979, acc=0.735, loss=53.484, grad_norm=5.434, loss_scale=1.000, learning_rate=6.763e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 13:01:41,051 (deepspeed_trainer:228) INFO: 35epoch:train:14701-14800batch: iter_time=1.108e-04, loss_ctc=70.714, loss_att=57.112, acc=0.719, loss=61.203, grad_norm=6.092, loss_scale=1.000, learning_rate=6.763e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 13:02:13,673 (deepspeed_trainer:228) INFO: 35epoch:train:14801-14900batch: iter_time=1.089e-04, loss_ctc=61.989, loss_att=47.256, acc=0.727, loss=51.654, grad_norm=5.111, loss_scale=1.000, learning_rate=6.762e-05, step_time=0.326 [2024-12-08 13:02:46,392] [INFO] [logging.py:129:log_dist] [Rank 0] step=525000, skipped=0, lr=[np.float64(6.761227598566819e-05)], mom=[[0.9, 0.98]] [2024-12-08 13:02:46,393] [INFO] [timer.py:264:stop] epoch=0/micro_step=135000/global_step=135000, RunningAvgSamplesPerSec=49.24717942059647, CurrSamplesPerSec=53.666217432613635, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 13:02:46,395 (deepspeed_trainer:228) INFO: 35epoch:train:14901-15000batch: iter_time=1.089e-04, loss_ctc=72.334, loss_att=48.984, acc=0.737, loss=55.995, grad_norm=5.101, loss_scale=1.000, learning_rate=6.762e-05, step_time=0.327 [2024-12-08 13:03:01,401] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:02,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:02,473] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:02,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:02,537] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:02,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:02,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:02,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:03,045] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:03,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:02,801] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:02,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:02,824] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:03,153] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:03,193] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:03,251] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:17,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:17,873] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:18,663] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:18,434] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:18,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:18,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:19,282] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:19,629] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:19,440] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:19,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:19,915] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:20,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:19,974] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:19,997] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:20,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:20,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:32,497] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:33,773] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:33,807] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:34,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:34,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:34,807] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:34,634] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:35,095] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:35,422] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:35,124] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:35,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:35,824] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:36,288] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:36,329] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:36,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:36,119] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:47,382] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:49,290] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:49,261] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:49,695] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:49,585] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:49,972] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:50,005] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:50,502] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:50,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:50,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:51,042] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:51,534] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:51,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:51,474] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:51,867] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:03:51,730] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:04:04,681] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 35 is about to be saved! [2024-12-08 13:04:04,715] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/mp_rank_00_model_states.pt [2024-12-08 13:04:04,715] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/mp_rank_00_model_states.pt... [2024-12-08 13:04:06,919] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/mp_rank_00_model_states.pt. [2024-12-08 13:04:06,737] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-08 13:04:07,077] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-08 13:04:06,739] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-08 13:04:07,077] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-08 13:04:06,740] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-08 13:04:06,740] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-08 13:04:06,740] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-08 13:04:06,741] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-08 13:04:07,081] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-08 13:04:07,081] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-08 13:04:07,082] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-08 13:04:06,745] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-08 13:04:07,082] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-08 13:04:07,082] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-08 13:04:07,083] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-08 13:04:06,746] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-08 13:04:07,459] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,459] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,459] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [2024-12-08 13:04:07,812] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,812] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,812] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [2024-12-08 13:04:07,832] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,512] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,512] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,512] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [2024-12-08 13:04:07,520] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,521] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,521] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [2024-12-08 13:04:07,527] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,528] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,528] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [2024-12-08 13:04:07,872] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,872] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [2024-12-08 13:04:07,537] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,537] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,537] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [2024-12-08 13:04:07,876] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,876] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,876] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [2024-12-08 13:04:07,878] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,878] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,878] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [2024-12-08 13:04:07,542] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,542] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,542] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [2024-12-08 13:04:07,564] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,564] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,565] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [2024-12-08 13:04:07,581] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,581] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,581] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [2024-12-08 13:04:07,966] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,966] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,966] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [2024-12-08 13:04:07,968] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,969] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,969] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [2024-12-08 13:04:07,986] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,987] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,987] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [2024-12-08 13:04:07,990] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-08 13:04:07,990] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_35/35/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-08 13:04:07,990] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 35 is ready now! [cnode7-012:0/16] 2024-12-08 13:04:08,059 (deepspeed_trainer:158) INFO: 35epoch results: [train] iter_time=0.135, loss_ctc=70.059, loss_att=51.744, acc=0.725, loss=57.239, grad_norm=5.623, loss_scale=1.000, learning_rate=6.810e-05, step_time=0.335, time=1 hour, 57 minutes and 45.11 seconds, total_count=525035, gpu_max_cached_mem_GB=41.424, [valid] loss_ctc=3.672, cer_ctc=0.067, loss_att=8.188, acc=0.787, cer=0.460, wer=1.000, loss=6.844, time=1 minute and 9.05 seconds, total_count=35, gpu_max_cached_mem_GB=41.424 [cnode7-012:0/16] 2024-12-08 13:04:10,360 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-08 13:04:36,694 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 13:04:51,712 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 13:04:51,712 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 13:04:51,715 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 13:05:13,183] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:05:13,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:05:14,622] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:05:15,308] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:05:15,584] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:05:15,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:05:15,977] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:05:16,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:05:16,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:05:16,802] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:05:17,053] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:05:17,569] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:05:17,947] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:05:18,087] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:05:18,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:05:19,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:03,303] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:03,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:04,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:04,708] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:04,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:04,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:04,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:05,382] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:05,309] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:05,361] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:06,793] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:07,329] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:07,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:08,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:11,029] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:11,415] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:51,568] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:53,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:53,927] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:53,996] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:54,047] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:54,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:54,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:54,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:55,528] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:56,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:55,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:56,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:56,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:06:57,064] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:00,540] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:02,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:40,230] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:42,278] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:42,857] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:43,069] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:43,596] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:44,133] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:44,559] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:44,213] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:44,454] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:45,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:45,893] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:46,840] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:47,214] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:47,537] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:49,934] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:07:50,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 13:08:49,940 (deepspeed_trainer:228) INFO: 36epoch:train:1-100batch: iter_time=2.411, loss_ctc=76.214, loss_att=51.348, acc=0.713, loss=58.796, grad_norm=6.159, loss_scale=1.000, learning_rate=6.761e-05, step_time=0.385 [cnode7-012:0/16] 2024-12-08 13:09:23,263 (deepspeed_trainer:228) INFO: 36epoch:train:101-200batch: iter_time=1.076e-04, loss_ctc=70.812, loss_att=49.999, acc=0.724, loss=56.252, grad_norm=5.184, loss_scale=1.000, learning_rate=6.760e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 13:09:56,272 (deepspeed_trainer:228) INFO: 36epoch:train:201-300batch: iter_time=1.082e-04, loss_ctc=62.050, loss_att=44.630, acc=0.721, loss=49.866, grad_norm=5.469, loss_scale=1.000, learning_rate=6.760e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 13:10:29,247 (deepspeed_trainer:228) INFO: 36epoch:train:301-400batch: iter_time=1.078e-04, loss_ctc=66.784, loss_att=51.363, acc=0.717, loss=55.979, grad_norm=5.478, loss_scale=1.000, learning_rate=6.759e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 13:11:02,326 (deepspeed_trainer:228) INFO: 36epoch:train:401-500batch: iter_time=1.099e-04, loss_ctc=81.138, loss_att=54.184, acc=0.730, loss=62.267, grad_norm=5.738, loss_scale=1.000, learning_rate=6.758e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 13:11:35,303 (deepspeed_trainer:228) INFO: 36epoch:train:501-600batch: iter_time=1.095e-04, loss_ctc=70.611, loss_att=52.851, acc=0.713, loss=58.210, grad_norm=6.338, loss_scale=1.000, learning_rate=6.758e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 13:12:08,156 (deepspeed_trainer:228) INFO: 36epoch:train:601-700batch: iter_time=1.093e-04, loss_ctc=68.918, loss_att=48.450, acc=0.726, loss=54.588, grad_norm=5.726, loss_scale=1.000, learning_rate=6.757e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 13:12:41,184 (deepspeed_trainer:228) INFO: 36epoch:train:701-800batch: iter_time=1.184e-04, loss_ctc=73.953, loss_att=58.129, acc=0.709, loss=62.842, grad_norm=5.731, loss_scale=1.000, learning_rate=6.756e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 13:13:14,062 (deepspeed_trainer:228) INFO: 36epoch:train:801-900batch: iter_time=1.063e-04, loss_ctc=68.809, loss_att=52.426, acc=0.712, loss=57.354, grad_norm=5.421, loss_scale=1.000, learning_rate=6.756e-05, step_time=0.329 [2024-12-08 13:13:47,310] [INFO] [logging.py:129:log_dist] [Rank 0] step=526000, skipped=0, lr=[np.float64(6.754797530789415e-05)], mom=[[0.9, 0.98]] [2024-12-08 13:13:47,311] [INFO] [timer.py:264:stop] epoch=0/micro_step=136000/global_step=136000, RunningAvgSamplesPerSec=49.246165204770186, CurrSamplesPerSec=51.10485703960055, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 13:13:47,315 (deepspeed_trainer:228) INFO: 36epoch:train:901-1000batch: iter_time=1.105e-04, loss_ctc=77.150, loss_att=57.952, acc=0.720, loss=63.681, grad_norm=5.574, loss_scale=1.000, learning_rate=6.755e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 13:14:20,247 (deepspeed_trainer:228) INFO: 36epoch:train:1001-1100batch: iter_time=1.082e-04, loss_ctc=68.784, loss_att=46.513, acc=0.733, loss=53.208, grad_norm=5.674, loss_scale=1.000, learning_rate=6.754e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 13:14:53,175 (deepspeed_trainer:228) INFO: 36epoch:train:1101-1200batch: iter_time=1.108e-04, loss_ctc=73.047, loss_att=51.038, acc=0.727, loss=57.638, grad_norm=6.134, loss_scale=1.000, learning_rate=6.754e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 13:15:26,358 (deepspeed_trainer:228) INFO: 36epoch:train:1201-1300batch: iter_time=1.229e-04, loss_ctc=70.622, loss_att=51.378, acc=0.729, loss=57.163, grad_norm=4.944, loss_scale=1.000, learning_rate=6.753e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 13:15:59,406 (deepspeed_trainer:228) INFO: 36epoch:train:1301-1400batch: iter_time=1.101e-04, loss_ctc=69.427, loss_att=54.921, acc=0.710, loss=59.263, grad_norm=5.944, loss_scale=1.000, learning_rate=6.753e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 13:16:32,938 (deepspeed_trainer:228) INFO: 36epoch:train:1401-1500batch: iter_time=2.214e-04, loss_ctc=68.046, loss_att=55.680, acc=0.711, loss=59.406, grad_norm=5.698, loss_scale=1.000, learning_rate=6.752e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 13:17:06,014 (deepspeed_trainer:228) INFO: 36epoch:train:1501-1600batch: iter_time=1.179e-04, loss_ctc=70.994, loss_att=49.617, acc=0.728, loss=56.043, grad_norm=5.443, loss_scale=1.000, learning_rate=6.751e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 13:17:39,246 (deepspeed_trainer:228) INFO: 36epoch:train:1601-1700batch: iter_time=1.108e-04, loss_ctc=67.660, loss_att=52.652, acc=0.717, loss=57.149, grad_norm=5.720, loss_scale=1.000, learning_rate=6.751e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 13:18:12,395 (deepspeed_trainer:228) INFO: 36epoch:train:1701-1800batch: iter_time=1.139e-04, loss_ctc=62.144, loss_att=46.072, acc=0.731, loss=50.888, grad_norm=5.074, loss_scale=1.000, learning_rate=6.750e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 13:18:41,794 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-08 13:19:07,682 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 13:19:24,750 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 13:19:24,750 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 13:19:24,752 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 13:19:56,382] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:19:56,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:19:56,689] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:19:56,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:19:56,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:19:56,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:19:56,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:19:56,913] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:19:56,420] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:19:56,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:19:56,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:19:56,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:19:56,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:19:56,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:19:56,665] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:19:56,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:46,475] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:46,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:47,422] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:46,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:47,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:46,953] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:48,215] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:47,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:49,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:49,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:49,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:48,190] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:48,464] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:48,878] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:49,131] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:20:49,246] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:35,846] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:36,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:36,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:35,976] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:37,257] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:36,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:36,865] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:37,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:38,298] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:38,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:37,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:38,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:38,812] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:38,822] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:39,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:21:41,981] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:23,127] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:24,834] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:24,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:25,205] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:25,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:25,449] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:26,416] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:27,028] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:26,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:27,388] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:27,677] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:27,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:28,033] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:28,519] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:30,350] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:22:32,853] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 13:23:08,977 (deepspeed_trainer:228) INFO: 36epoch:train:1801-1900batch: iter_time=2.561, loss_ctc=67.875, loss_att=49.029, acc=0.726, loss=54.663, grad_norm=4.951, loss_scale=1.000, learning_rate=6.749e-05, step_time=0.413 [2024-12-08 13:23:42,801] [INFO] [logging.py:129:log_dist] [Rank 0] step=527000, skipped=0, lr=[np.float64(6.748385773571065e-05)], mom=[[0.9, 0.98]] [2024-12-08 13:23:42,804] [INFO] [timer.py:264:stop] epoch=0/micro_step=137000/global_step=137000, RunningAvgSamplesPerSec=49.24045048815542, CurrSamplesPerSec=52.54282637687001, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 13:23:42,806 (deepspeed_trainer:228) INFO: 36epoch:train:1901-2000batch: iter_time=1.152e-04, loss_ctc=75.498, loss_att=51.861, acc=0.722, loss=58.946, grad_norm=5.983, loss_scale=1.000, learning_rate=6.749e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 13:24:16,391 (deepspeed_trainer:228) INFO: 36epoch:train:2001-2100batch: iter_time=1.116e-04, loss_ctc=63.599, loss_att=48.189, acc=0.723, loss=52.808, grad_norm=5.522, loss_scale=1.000, learning_rate=6.748e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 13:24:50,469 (deepspeed_trainer:228) INFO: 36epoch:train:2101-2200batch: iter_time=1.138e-04, loss_ctc=65.790, loss_att=49.263, acc=0.725, loss=54.217, grad_norm=5.364, loss_scale=1.000, learning_rate=6.747e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 13:25:23,716 (deepspeed_trainer:228) INFO: 36epoch:train:2201-2300batch: iter_time=1.183e-04, loss_ctc=66.878, loss_att=50.323, acc=0.731, loss=55.310, grad_norm=5.403, loss_scale=1.000, learning_rate=6.747e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 13:25:57,161 (deepspeed_trainer:228) INFO: 36epoch:train:2301-2400batch: iter_time=1.113e-04, loss_ctc=73.816, loss_att=53.202, acc=0.725, loss=59.386, grad_norm=5.644, loss_scale=1.000, learning_rate=6.746e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 13:26:30,624 (deepspeed_trainer:228) INFO: 36epoch:train:2401-2500batch: iter_time=1.129e-04, loss_ctc=73.451, loss_att=54.348, acc=0.717, loss=60.096, grad_norm=6.211, loss_scale=1.000, learning_rate=6.746e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 13:27:04,131 (deepspeed_trainer:228) INFO: 36epoch:train:2501-2600batch: iter_time=1.100e-04, loss_ctc=68.414, loss_att=47.822, acc=0.736, loss=53.983, grad_norm=5.965, loss_scale=1.000, learning_rate=6.745e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 13:27:37,375 (deepspeed_trainer:228) INFO: 36epoch:train:2601-2700batch: iter_time=1.115e-04, loss_ctc=73.906, loss_att=57.704, acc=0.711, loss=62.522, grad_norm=5.756, loss_scale=1.000, learning_rate=6.744e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 13:28:11,240 (deepspeed_trainer:228) INFO: 36epoch:train:2701-2800batch: iter_time=1.114e-04, loss_ctc=70.876, loss_att=56.279, acc=0.723, loss=60.655, grad_norm=5.337, loss_scale=1.000, learning_rate=6.744e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 13:28:44,851 (deepspeed_trainer:228) INFO: 36epoch:train:2801-2900batch: iter_time=1.118e-04, loss_ctc=73.618, loss_att=53.137, acc=0.728, loss=59.275, grad_norm=6.227, loss_scale=1.000, learning_rate=6.743e-05, step_time=0.336 [2024-12-08 13:29:18,121] [INFO] [logging.py:129:log_dist] [Rank 0] step=528000, skipped=0, lr=[np.float64(6.741992240173095e-05)], mom=[[0.9, 0.98]] [2024-12-08 13:29:18,122] [INFO] [timer.py:264:stop] epoch=0/micro_step=138000/global_step=138000, RunningAvgSamplesPerSec=49.23955460801323, CurrSamplesPerSec=50.38338439350229, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 13:29:18,125 (deepspeed_trainer:228) INFO: 36epoch:train:2901-3000batch: iter_time=1.098e-04, loss_ctc=70.562, loss_att=51.060, acc=0.739, loss=56.909, grad_norm=5.546, loss_scale=1.000, learning_rate=6.742e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 13:29:51,340 (deepspeed_trainer:228) INFO: 36epoch:train:3001-3100batch: iter_time=1.091e-04, loss_ctc=69.739, loss_att=49.961, acc=0.730, loss=55.880, grad_norm=6.009, loss_scale=1.000, learning_rate=6.742e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 13:30:24,815 (deepspeed_trainer:228) INFO: 36epoch:train:3101-3200batch: iter_time=1.110e-04, loss_ctc=65.899, loss_att=47.046, acc=0.738, loss=52.691, grad_norm=5.333, loss_scale=1.000, learning_rate=6.741e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 13:30:58,175 (deepspeed_trainer:228) INFO: 36epoch:train:3201-3300batch: iter_time=1.101e-04, loss_ctc=72.907, loss_att=58.753, acc=0.720, loss=63.001, grad_norm=6.303, loss_scale=1.000, learning_rate=6.740e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 13:31:31,273 (deepspeed_trainer:228) INFO: 36epoch:train:3301-3400batch: iter_time=1.100e-04, loss_ctc=67.469, loss_att=52.673, acc=0.725, loss=57.135, grad_norm=5.660, loss_scale=1.000, learning_rate=6.740e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 13:32:04,399 (deepspeed_trainer:228) INFO: 36epoch:train:3401-3500batch: iter_time=1.095e-04, loss_ctc=69.649, loss_att=51.272, acc=0.729, loss=56.760, grad_norm=5.665, loss_scale=1.000, learning_rate=6.739e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 13:32:37,760 (deepspeed_trainer:228) INFO: 36epoch:train:3501-3600batch: iter_time=1.114e-04, loss_ctc=68.748, loss_att=54.042, acc=0.726, loss=58.466, grad_norm=5.985, loss_scale=1.000, learning_rate=6.738e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 13:33:10,807 (deepspeed_trainer:228) INFO: 36epoch:train:3601-3700batch: iter_time=1.088e-04, loss_ctc=60.781, loss_att=44.833, acc=0.739, loss=49.613, grad_norm=5.139, loss_scale=1.000, learning_rate=6.738e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 13:33:30,968 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-08 13:33:57,669 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 13:34:13,553 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 13:34:13,554 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 13:34:13,556 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 13:34:39,665] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:34:40,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:34:40,604] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:34:40,895] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:34:41,743] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:34:42,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:34:42,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:34:42,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:34:42,824] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:34:43,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:34:43,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:34:43,791] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:34:43,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:34:43,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:34:44,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:34:47,200] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:28,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:29,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:29,714] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:30,416] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:32,307] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:32,623] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:32,684] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:32,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:32,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:33,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:33,521] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:33,487] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:33,840] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:34,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:34,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:35:38,354] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:18,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:18,454] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:18,567] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:20,989] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:21,638] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:22,307] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:22,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:22,730] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:22,584] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:23,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:23,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:23,563] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:23,777] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:24,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:24,617] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:36:30,220] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:06,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:07,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:09,014] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:09,747] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:09,868] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:10,308] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:10,426] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:12,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:12,810] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:13,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:13,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:13,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:13,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:15,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:16,046] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:37:21,767] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 13:38:04,288 (deepspeed_trainer:228) INFO: 36epoch:train:3701-3800batch: iter_time=2.536, loss_ctc=67.051, loss_att=46.629, acc=0.731, loss=52.757, grad_norm=5.500, loss_scale=1.000, learning_rate=6.737e-05, step_time=0.398 [cnode7-012:0/16] 2024-12-08 13:38:37,512 (deepspeed_trainer:228) INFO: 36epoch:train:3801-3900batch: iter_time=1.072e-04, loss_ctc=75.456, loss_att=53.042, acc=0.722, loss=59.761, grad_norm=6.093, loss_scale=1.000, learning_rate=6.737e-05, step_time=0.332 [2024-12-08 13:39:10,408] [INFO] [logging.py:129:log_dist] [Rank 0] step=529000, skipped=0, lr=[np.float64(6.735616844430989e-05)], mom=[[0.9, 0.98]] [2024-12-08 13:39:10,408] [INFO] [timer.py:264:stop] epoch=0/micro_step=139000/global_step=139000, RunningAvgSamplesPerSec=49.23518723288008, CurrSamplesPerSec=49.62582506468896, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 13:39:10,411 (deepspeed_trainer:228) INFO: 36epoch:train:3901-4000batch: iter_time=1.071e-04, loss_ctc=58.859, loss_att=42.858, acc=0.733, loss=47.646, grad_norm=5.136, loss_scale=1.000, learning_rate=6.736e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 13:39:43,860 (deepspeed_trainer:228) INFO: 36epoch:train:4001-4100batch: iter_time=1.073e-04, loss_ctc=68.762, loss_att=51.503, acc=0.726, loss=56.697, grad_norm=5.631, loss_scale=1.000, learning_rate=6.735e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 13:40:17,096 (deepspeed_trainer:228) INFO: 36epoch:train:4101-4200batch: iter_time=1.104e-04, loss_ctc=71.620, loss_att=51.235, acc=0.731, loss=57.351, grad_norm=5.949, loss_scale=1.000, learning_rate=6.735e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 13:40:49,784 (deepspeed_trainer:228) INFO: 36epoch:train:4201-4300batch: iter_time=1.058e-04, loss_ctc=64.848, loss_att=47.092, acc=0.728, loss=52.452, grad_norm=5.142, loss_scale=1.000, learning_rate=6.734e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 13:41:22,723 (deepspeed_trainer:228) INFO: 36epoch:train:4301-4400batch: iter_time=1.087e-04, loss_ctc=75.084, loss_att=57.215, acc=0.720, loss=62.563, grad_norm=5.698, loss_scale=1.000, learning_rate=6.733e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 13:41:55,566 (deepspeed_trainer:228) INFO: 36epoch:train:4401-4500batch: iter_time=1.075e-04, loss_ctc=75.355, loss_att=54.004, acc=0.732, loss=60.398, grad_norm=5.780, loss_scale=1.000, learning_rate=6.733e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 13:42:28,234 (deepspeed_trainer:228) INFO: 36epoch:train:4501-4600batch: iter_time=1.082e-04, loss_ctc=68.407, loss_att=53.163, acc=0.719, loss=57.747, grad_norm=5.554, loss_scale=1.000, learning_rate=6.732e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 13:43:01,228 (deepspeed_trainer:228) INFO: 36epoch:train:4601-4700batch: iter_time=1.074e-04, loss_ctc=71.368, loss_att=54.241, acc=0.729, loss=59.389, grad_norm=6.022, loss_scale=1.000, learning_rate=6.731e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 13:43:34,089 (deepspeed_trainer:228) INFO: 36epoch:train:4701-4800batch: iter_time=1.080e-04, loss_ctc=72.220, loss_att=52.401, acc=0.732, loss=58.355, grad_norm=5.664, loss_scale=1.000, learning_rate=6.731e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 13:44:07,100 (deepspeed_trainer:228) INFO: 36epoch:train:4801-4900batch: iter_time=1.097e-04, loss_ctc=74.794, loss_att=54.847, acc=0.732, loss=60.819, grad_norm=6.289, loss_scale=1.000, learning_rate=6.730e-05, step_time=0.330 [2024-12-08 13:44:39,881] [INFO] [logging.py:129:log_dist] [Rank 0] step=530000, skipped=0, lr=[np.float64(6.729259500749506e-05)], mom=[[0.9, 0.98]] [2024-12-08 13:44:39,882] [INFO] [timer.py:264:stop] epoch=0/micro_step=140000/global_step=140000, RunningAvgSamplesPerSec=49.24099182107409, CurrSamplesPerSec=50.914020605092965, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 13:44:39,884 (deepspeed_trainer:228) INFO: 36epoch:train:4901-5000batch: iter_time=1.072e-04, loss_ctc=64.775, loss_att=46.535, acc=0.733, loss=52.040, grad_norm=5.181, loss_scale=1.000, learning_rate=6.730e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 13:45:13,902 (deepspeed_trainer:228) INFO: 36epoch:train:5001-5100batch: iter_time=1.075e-04, loss_ctc=70.696, loss_att=50.231, acc=0.740, loss=56.382, grad_norm=5.535, loss_scale=1.000, learning_rate=6.729e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 13:45:47,235 (deepspeed_trainer:228) INFO: 36epoch:train:5101-5200batch: iter_time=1.078e-04, loss_ctc=69.385, loss_att=57.366, acc=0.723, loss=60.981, grad_norm=5.460, loss_scale=1.000, learning_rate=6.728e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 13:46:20,525 (deepspeed_trainer:228) INFO: 36epoch:train:5201-5300batch: iter_time=1.080e-04, loss_ctc=66.127, loss_att=49.994, acc=0.729, loss=54.817, grad_norm=4.913, loss_scale=1.000, learning_rate=6.728e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 13:46:53,637 (deepspeed_trainer:228) INFO: 36epoch:train:5301-5400batch: iter_time=1.068e-04, loss_ctc=68.891, loss_att=53.504, acc=0.721, loss=58.118, grad_norm=6.130, loss_scale=1.000, learning_rate=6.727e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 13:47:26,728 (deepspeed_trainer:228) INFO: 36epoch:train:5401-5500batch: iter_time=1.086e-04, loss_ctc=64.724, loss_att=50.286, acc=0.732, loss=54.624, grad_norm=5.230, loss_scale=1.000, learning_rate=6.726e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 13:47:59,480 (deepspeed_trainer:228) INFO: 36epoch:train:5501-5600batch: iter_time=1.082e-04, loss_ctc=60.594, loss_att=43.090, acc=0.740, loss=48.346, grad_norm=4.578, loss_scale=1.000, learning_rate=6.726e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 13:48:11,298 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-08 13:48:37,902 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 13:48:55,867 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 13:48:55,868 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 13:48:55,870 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 13:49:19,745] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:49:20,280] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:49:20,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:49:20,956] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:49:21,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:49:22,257] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:49:22,307] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:49:22,580] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:49:23,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:49:23,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:49:24,183] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:49:24,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:49:23,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:49:24,917] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:49:25,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:49:25,245] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:08,617] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:09,682] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:10,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:10,496] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:11,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:13,329] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:12,719] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:12,742] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:13,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:13,149] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:14,109] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:14,376] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:14,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:14,996] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:15,351] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:16,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:57,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:57,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:58,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:50:59,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:00,276] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:01,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:01,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:01,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:03,422] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:03,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:04,400] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:04,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:04,687] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:06,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:07,759] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:08,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:46,485] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:47,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:48,444] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:48,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:48,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:49,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:50,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:52,227] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:53,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:53,250] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:54,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:54,561] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:54,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:56,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:59,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 13:51:59,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 13:52:51,811 (deepspeed_trainer:228) INFO: 36epoch:train:5601-5700batch: iter_time=2.553, loss_ctc=72.163, loss_att=51.148, acc=0.731, loss=57.449, grad_norm=5.232, loss_scale=1.000, learning_rate=6.725e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-08 13:53:25,143 (deepspeed_trainer:228) INFO: 36epoch:train:5701-5800batch: iter_time=1.074e-04, loss_ctc=70.188, loss_att=49.993, acc=0.726, loss=56.027, grad_norm=5.438, loss_scale=1.000, learning_rate=6.725e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 13:53:58,426 (deepspeed_trainer:228) INFO: 36epoch:train:5801-5900batch: iter_time=1.130e-04, loss_ctc=62.581, loss_att=45.919, acc=0.731, loss=50.909, grad_norm=5.552, loss_scale=1.000, learning_rate=6.724e-05, step_time=0.333 [2024-12-08 13:54:31,483] [INFO] [logging.py:129:log_dist] [Rank 0] step=531000, skipped=0, lr=[np.float64(6.722920124097862e-05)], mom=[[0.9, 0.98]] [2024-12-08 13:54:31,484] [INFO] [timer.py:264:stop] epoch=0/micro_step=141000/global_step=141000, RunningAvgSamplesPerSec=49.23897192482085, CurrSamplesPerSec=49.527012554808955, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 13:54:31,487 (deepspeed_trainer:228) INFO: 36epoch:train:5901-6000batch: iter_time=1.131e-04, loss_ctc=64.725, loss_att=48.100, acc=0.728, loss=53.093, grad_norm=5.705, loss_scale=1.000, learning_rate=6.723e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 13:55:05,155 (deepspeed_trainer:228) INFO: 36epoch:train:6001-6100batch: iter_time=1.125e-04, loss_ctc=77.835, loss_att=54.553, acc=0.735, loss=61.544, grad_norm=6.023, loss_scale=1.000, learning_rate=6.723e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 13:55:39,150 (deepspeed_trainer:228) INFO: 36epoch:train:6101-6200batch: iter_time=1.125e-04, loss_ctc=66.722, loss_att=52.358, acc=0.725, loss=56.665, grad_norm=5.635, loss_scale=1.000, learning_rate=6.722e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 13:56:12,161 (deepspeed_trainer:228) INFO: 36epoch:train:6201-6300batch: iter_time=1.109e-04, loss_ctc=74.299, loss_att=51.127, acc=0.726, loss=58.080, grad_norm=6.424, loss_scale=1.000, learning_rate=6.721e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 13:56:45,055 (deepspeed_trainer:228) INFO: 36epoch:train:6301-6400batch: iter_time=1.097e-04, loss_ctc=69.938, loss_att=52.522, acc=0.728, loss=57.739, grad_norm=5.252, loss_scale=1.000, learning_rate=6.721e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 13:57:18,068 (deepspeed_trainer:228) INFO: 36epoch:train:6401-6500batch: iter_time=1.101e-04, loss_ctc=69.978, loss_att=53.982, acc=0.719, loss=58.783, grad_norm=5.411, loss_scale=1.000, learning_rate=6.720e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 13:57:51,440 (deepspeed_trainer:228) INFO: 36epoch:train:6501-6600batch: iter_time=1.106e-04, loss_ctc=76.746, loss_att=56.258, acc=0.727, loss=62.421, grad_norm=6.036, loss_scale=1.000, learning_rate=6.719e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 13:58:24,673 (deepspeed_trainer:228) INFO: 36epoch:train:6601-6700batch: iter_time=1.120e-04, loss_ctc=65.188, loss_att=47.753, acc=0.741, loss=52.993, grad_norm=5.163, loss_scale=1.000, learning_rate=6.719e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 13:58:57,590 (deepspeed_trainer:228) INFO: 36epoch:train:6701-6800batch: iter_time=1.124e-04, loss_ctc=76.554, loss_att=53.483, acc=0.731, loss=60.382, grad_norm=6.272, loss_scale=1.000, learning_rate=6.718e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 13:59:31,005 (deepspeed_trainer:228) INFO: 36epoch:train:6801-6900batch: iter_time=1.122e-04, loss_ctc=69.826, loss_att=51.034, acc=0.736, loss=56.657, grad_norm=5.460, loss_scale=1.000, learning_rate=6.718e-05, step_time=0.334 [2024-12-08 14:00:04,176] [INFO] [logging.py:129:log_dist] [Rank 0] step=532000, skipped=0, lr=[np.float64(6.716598630004944e-05)], mom=[[0.9, 0.98]] [2024-12-08 14:00:04,176] [INFO] [timer.py:264:stop] epoch=0/micro_step=142000/global_step=142000, RunningAvgSamplesPerSec=49.24128190415398, CurrSamplesPerSec=50.88695722682251, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 14:00:04,179 (deepspeed_trainer:228) INFO: 36epoch:train:6901-7000batch: iter_time=1.122e-04, loss_ctc=68.671, loss_att=53.466, acc=0.730, loss=58.023, grad_norm=5.610, loss_scale=1.000, learning_rate=6.717e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 14:00:37,784 (deepspeed_trainer:228) INFO: 36epoch:train:7001-7100batch: iter_time=1.111e-04, loss_ctc=66.752, loss_att=52.136, acc=0.724, loss=56.532, grad_norm=5.651, loss_scale=1.000, learning_rate=6.716e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 14:01:11,913 (deepspeed_trainer:228) INFO: 36epoch:train:7101-7200batch: iter_time=1.102e-04, loss_ctc=70.335, loss_att=52.157, acc=0.731, loss=57.583, grad_norm=5.120, loss_scale=1.000, learning_rate=6.716e-05, step_time=0.341 [cnode7-012:0/16] 2024-12-08 14:01:45,675 (deepspeed_trainer:228) INFO: 36epoch:train:7201-7300batch: iter_time=1.101e-04, loss_ctc=65.951, loss_att=53.647, acc=0.718, loss=57.338, grad_norm=5.690, loss_scale=1.000, learning_rate=6.715e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 14:02:18,862 (deepspeed_trainer:228) INFO: 36epoch:train:7301-7400batch: iter_time=1.161e-04, loss_ctc=60.552, loss_att=45.465, acc=0.742, loss=49.984, grad_norm=5.065, loss_scale=1.000, learning_rate=6.714e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 14:02:52,017 (deepspeed_trainer:228) INFO: 36epoch:train:7401-7500batch: iter_time=1.067e-04, loss_ctc=65.104, loss_att=46.708, acc=0.734, loss=52.236, grad_norm=4.845, loss_scale=1.000, learning_rate=6.714e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 14:02:55,347 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-08 14:03:23,642 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 14:03:39,547 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 14:03:39,547 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 14:03:39,550 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 14:04:04,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:05,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:06,233] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:06,432] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:06,668] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:06,556] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:07,482] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:07,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:07,765] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:07,682] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:08,155] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:08,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:08,480] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:08,883] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:09,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:10,948] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:54,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:54,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:55,568] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:55,819] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:57,379] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:57,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:58,049] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:58,106] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:58,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:58,337] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:58,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:58,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:59,397] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:04:59,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:00,777] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:02,983] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:42,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:42,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:44,138] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:47,177] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:47,773] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:47,684] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:47,911] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:48,046] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:48,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:48,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:49,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:49,828] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:50,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:51,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:52,116] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:05:55,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:31,293] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:31,598] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:33,582] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:35,840] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:36,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:37,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:37,571] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:37,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:37,517] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:39,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:39,664] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:40,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:40,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:41,752] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:45,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:06:48,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 14:07:49,654 (deepspeed_trainer:228) INFO: 36epoch:train:7501-7600batch: iter_time=2.559, loss_ctc=72.422, loss_att=52.417, acc=0.715, loss=58.415, grad_norm=5.453, loss_scale=1.000, learning_rate=6.713e-05, step_time=0.417 [cnode7-012:0/16] 2024-12-08 14:08:22,989 (deepspeed_trainer:228) INFO: 36epoch:train:7601-7700batch: iter_time=1.209e-04, loss_ctc=69.384, loss_att=49.803, acc=0.729, loss=55.664, grad_norm=5.559, loss_scale=1.000, learning_rate=6.712e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 14:08:56,069 (deepspeed_trainer:228) INFO: 36epoch:train:7701-7800batch: iter_time=1.113e-04, loss_ctc=60.644, loss_att=44.486, acc=0.727, loss=49.339, grad_norm=5.381, loss_scale=1.000, learning_rate=6.712e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 14:09:29,393 (deepspeed_trainer:228) INFO: 36epoch:train:7801-7900batch: iter_time=1.085e-04, loss_ctc=65.656, loss_att=51.405, acc=0.720, loss=55.690, grad_norm=5.877, loss_scale=1.000, learning_rate=6.711e-05, step_time=0.333 [2024-12-08 14:10:02,417] [INFO] [logging.py:129:log_dist] [Rank 0] step=533000, skipped=0, lr=[np.float64(6.71029493455459e-05)], mom=[[0.9, 0.98]] [2024-12-08 14:10:02,418] [INFO] [timer.py:264:stop] epoch=0/micro_step=143000/global_step=143000, RunningAvgSamplesPerSec=49.23339407936079, CurrSamplesPerSec=50.17079312594887, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 14:10:02,421 (deepspeed_trainer:228) INFO: 36epoch:train:7901-8000batch: iter_time=1.102e-04, loss_ctc=78.344, loss_att=53.781, acc=0.734, loss=61.167, grad_norm=5.226, loss_scale=1.000, learning_rate=6.711e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 14:10:35,185 (deepspeed_trainer:228) INFO: 36epoch:train:8001-8100batch: iter_time=1.111e-04, loss_ctc=68.172, loss_att=51.936, acc=0.718, loss=56.810, grad_norm=5.848, loss_scale=1.000, learning_rate=6.710e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 14:11:08,059 (deepspeed_trainer:228) INFO: 36epoch:train:8101-8200batch: iter_time=1.120e-04, loss_ctc=68.142, loss_att=47.909, acc=0.731, loss=53.982, grad_norm=5.935, loss_scale=1.000, learning_rate=6.709e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 14:11:40,989 (deepspeed_trainer:228) INFO: 36epoch:train:8201-8300batch: iter_time=1.108e-04, loss_ctc=72.682, loss_att=57.667, acc=0.713, loss=62.178, grad_norm=5.907, loss_scale=1.000, learning_rate=6.709e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 14:12:13,894 (deepspeed_trainer:228) INFO: 36epoch:train:8301-8400batch: iter_time=1.177e-04, loss_ctc=68.047, loss_att=52.773, acc=0.713, loss=57.321, grad_norm=5.315, loss_scale=1.000, learning_rate=6.708e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 14:12:46,909 (deepspeed_trainer:228) INFO: 36epoch:train:8401-8500batch: iter_time=1.112e-04, loss_ctc=76.156, loss_att=56.665, acc=0.726, loss=62.520, grad_norm=6.161, loss_scale=1.000, learning_rate=6.707e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 14:13:19,828 (deepspeed_trainer:228) INFO: 36epoch:train:8501-8600batch: iter_time=1.107e-04, loss_ctc=67.588, loss_att=46.402, acc=0.735, loss=52.779, grad_norm=5.635, loss_scale=1.000, learning_rate=6.707e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 14:13:52,720 (deepspeed_trainer:228) INFO: 36epoch:train:8601-8700batch: iter_time=1.142e-04, loss_ctc=70.914, loss_att=50.264, acc=0.729, loss=56.468, grad_norm=6.260, loss_scale=1.000, learning_rate=6.706e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 14:14:25,801 (deepspeed_trainer:228) INFO: 36epoch:train:8701-8800batch: iter_time=1.117e-04, loss_ctc=69.825, loss_att=51.263, acc=0.733, loss=56.832, grad_norm=5.518, loss_scale=1.000, learning_rate=6.706e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 14:14:59,027 (deepspeed_trainer:228) INFO: 36epoch:train:8801-8900batch: iter_time=1.101e-04, loss_ctc=67.955, loss_att=53.801, acc=0.716, loss=58.052, grad_norm=5.720, loss_scale=1.000, learning_rate=6.705e-05, step_time=0.332 [2024-12-08 14:15:32,153] [INFO] [logging.py:129:log_dist] [Rank 0] step=534000, skipped=0, lr=[np.float64(6.704008954380906e-05)], mom=[[0.9, 0.98]] [2024-12-08 14:15:32,154] [INFO] [timer.py:264:stop] epoch=0/micro_step=144000/global_step=144000, RunningAvgSamplesPerSec=49.23881509519059, CurrSamplesPerSec=47.30442961173905, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 14:15:32,156 (deepspeed_trainer:228) INFO: 36epoch:train:8901-9000batch: iter_time=1.147e-04, loss_ctc=66.377, loss_att=54.486, acc=0.718, loss=58.048, grad_norm=5.519, loss_scale=1.000, learning_rate=6.704e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 14:16:05,547 (deepspeed_trainer:228) INFO: 36epoch:train:9001-9100batch: iter_time=1.134e-04, loss_ctc=69.863, loss_att=48.947, acc=0.733, loss=55.235, grad_norm=5.106, loss_scale=1.000, learning_rate=6.704e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 14:16:39,060 (deepspeed_trainer:228) INFO: 36epoch:train:9101-9200batch: iter_time=1.110e-04, loss_ctc=66.797, loss_att=52.173, acc=0.717, loss=56.572, grad_norm=6.492, loss_scale=1.000, learning_rate=6.703e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 14:17:12,253 (deepspeed_trainer:228) INFO: 36epoch:train:9201-9300batch: iter_time=1.139e-04, loss_ctc=61.073, loss_att=45.798, acc=0.733, loss=50.361, grad_norm=4.613, loss_scale=1.000, learning_rate=6.702e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 14:17:41,234 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-08 14:18:08,572 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 14:18:25,244 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 14:18:25,245 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 14:18:25,247 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 14:18:51,257] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:18:51,508] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:18:52,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:18:52,237] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:18:52,728] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:18:52,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:18:53,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:18:52,976] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:18:53,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:18:53,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:18:54,299] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:18:54,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:18:54,257] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:18:54,806] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:18:54,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:18:55,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:39,952] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:40,507] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:40,769] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:41,061] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:42,002] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:42,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:42,449] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:42,940] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:43,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:43,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:44,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:46,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:46,451] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:47,007] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:47,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:19:50,382] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:29,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:29,835] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:31,110] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:31,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:31,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:32,574] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:33,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:33,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:33,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:34,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:34,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:36,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:36,769] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:37,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:37,279] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:20:40,718] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:17,553] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:18,566] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:18,790] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:19,457] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:20,567] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:20,929] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:21,510] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:21,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:21,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:23,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:23,987] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:24,551] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:25,770] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:27,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:27,234] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:21:29,146] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 14:22:04,874 (deepspeed_trainer:228) INFO: 36epoch:train:9301-9400batch: iter_time=2.549, loss_ctc=66.899, loss_att=48.704, acc=0.730, loss=54.163, grad_norm=5.006, loss_scale=1.000, learning_rate=6.702e-05, step_time=0.382 [cnode7-012:0/16] 2024-12-08 14:22:38,730 (deepspeed_trainer:228) INFO: 36epoch:train:9401-9500batch: iter_time=1.111e-04, loss_ctc=74.096, loss_att=51.078, acc=0.726, loss=57.990, grad_norm=5.730, loss_scale=1.000, learning_rate=6.701e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 14:23:12,207 (deepspeed_trainer:228) INFO: 36epoch:train:9501-9600batch: iter_time=1.118e-04, loss_ctc=63.749, loss_att=47.741, acc=0.727, loss=52.564, grad_norm=5.264, loss_scale=1.000, learning_rate=6.701e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 14:23:45,504 (deepspeed_trainer:228) INFO: 36epoch:train:9601-9700batch: iter_time=1.089e-04, loss_ctc=64.351, loss_att=48.053, acc=0.730, loss=52.933, grad_norm=5.353, loss_scale=1.000, learning_rate=6.700e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 14:24:19,188 (deepspeed_trainer:228) INFO: 36epoch:train:9701-9800batch: iter_time=1.117e-04, loss_ctc=65.513, loss_att=49.714, acc=0.737, loss=54.417, grad_norm=5.842, loss_scale=1.000, learning_rate=6.699e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 14:24:52,630 (deepspeed_trainer:228) INFO: 36epoch:train:9801-9900batch: iter_time=1.106e-04, loss_ctc=71.851, loss_att=52.926, acc=0.730, loss=58.575, grad_norm=5.356, loss_scale=1.000, learning_rate=6.699e-05, step_time=0.334 [2024-12-08 14:25:25,901] [INFO] [logging.py:129:log_dist] [Rank 0] step=535000, skipped=0, lr=[np.float64(6.697740606663636e-05)], mom=[[0.9, 0.98]] [2024-12-08 14:25:25,901] [INFO] [timer.py:264:stop] epoch=0/micro_step=145000/global_step=145000, RunningAvgSamplesPerSec=49.23418289821721, CurrSamplesPerSec=48.460589685331946, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 14:25:25,904 (deepspeed_trainer:228) INFO: 36epoch:train:9901-10000batch: iter_time=1.116e-04, loss_ctc=71.772, loss_att=53.271, acc=0.723, loss=58.864, grad_norm=5.699, loss_scale=1.000, learning_rate=6.698e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 14:25:59,955 (deepspeed_trainer:228) INFO: 36epoch:train:10001-10100batch: iter_time=1.128e-04, loss_ctc=67.510, loss_att=46.874, acc=0.741, loss=53.066, grad_norm=5.490, loss_scale=1.000, learning_rate=6.697e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 14:26:33,116 (deepspeed_trainer:228) INFO: 36epoch:train:10101-10200batch: iter_time=1.163e-04, loss_ctc=73.112, loss_att=56.966, acc=0.716, loss=61.814, grad_norm=5.824, loss_scale=1.000, learning_rate=6.697e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 14:27:06,529 (deepspeed_trainer:228) INFO: 36epoch:train:10201-10300batch: iter_time=1.141e-04, loss_ctc=70.093, loss_att=55.824, acc=0.728, loss=60.124, grad_norm=5.580, loss_scale=1.000, learning_rate=6.696e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 14:27:39,665 (deepspeed_trainer:228) INFO: 36epoch:train:10301-10400batch: iter_time=1.123e-04, loss_ctc=72.638, loss_att=52.200, acc=0.733, loss=58.309, grad_norm=5.922, loss_scale=1.000, learning_rate=6.696e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 14:28:12,722 (deepspeed_trainer:228) INFO: 36epoch:train:10401-10500batch: iter_time=1.134e-04, loss_ctc=69.916, loss_att=50.563, acc=0.741, loss=56.387, grad_norm=5.483, loss_scale=1.000, learning_rate=6.695e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 14:28:46,288 (deepspeed_trainer:228) INFO: 36epoch:train:10501-10600batch: iter_time=1.128e-04, loss_ctc=68.448, loss_att=49.309, acc=0.733, loss=55.055, grad_norm=6.146, loss_scale=1.000, learning_rate=6.694e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 14:29:19,280 (deepspeed_trainer:228) INFO: 36epoch:train:10601-10700batch: iter_time=1.142e-04, loss_ctc=65.413, loss_att=46.525, acc=0.741, loss=52.202, grad_norm=5.304, loss_scale=1.000, learning_rate=6.694e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 14:29:52,879 (deepspeed_trainer:228) INFO: 36epoch:train:10701-10800batch: iter_time=1.135e-04, loss_ctc=72.269, loss_att=58.531, acc=0.724, loss=62.639, grad_norm=6.024, loss_scale=1.000, learning_rate=6.693e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 14:30:26,043 (deepspeed_trainer:228) INFO: 36epoch:train:10801-10900batch: iter_time=1.137e-04, loss_ctc=67.234, loss_att=52.568, acc=0.727, loss=56.980, grad_norm=5.661, loss_scale=1.000, learning_rate=6.692e-05, step_time=0.332 [2024-12-08 14:30:58,984] [INFO] [logging.py:129:log_dist] [Rank 0] step=536000, skipped=0, lr=[np.float64(6.691489809123581e-05)], mom=[[0.9, 0.98]] [2024-12-08 14:30:58,985] [INFO] [timer.py:264:stop] epoch=0/micro_step=146000/global_step=146000, RunningAvgSamplesPerSec=49.2362714642169, CurrSamplesPerSec=52.98551239129179, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 14:30:58,988 (deepspeed_trainer:228) INFO: 36epoch:train:10901-11000batch: iter_time=1.125e-04, loss_ctc=68.360, loss_att=50.633, acc=0.732, loss=55.994, grad_norm=4.911, loss_scale=1.000, learning_rate=6.692e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 14:31:31,987 (deepspeed_trainer:228) INFO: 36epoch:train:11001-11100batch: iter_time=1.105e-04, loss_ctc=68.023, loss_att=53.671, acc=0.727, loss=57.972, grad_norm=6.037, loss_scale=1.000, learning_rate=6.691e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 14:32:05,162 (deepspeed_trainer:228) INFO: 36epoch:train:11101-11200batch: iter_time=1.107e-04, loss_ctc=59.417, loss_att=43.833, acc=0.742, loss=48.513, grad_norm=4.961, loss_scale=1.000, learning_rate=6.691e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 14:32:25,747 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-08 14:32:53,242 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 14:33:09,236 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 14:33:09,236 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 14:33:09,238 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 14:33:37,481] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:33:37,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:33:37,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:33:38,203] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:33:39,358] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:33:39,304] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:33:39,406] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:33:39,703] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:33:39,717] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:33:39,754] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:33:40,018] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:33:40,229] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:33:40,386] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:33:40,443] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:33:40,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:33:40,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:26,425] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:26,904] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:27,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:27,598] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:27,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:28,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:28,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:29,152] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:28,948] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:29,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:30,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:30,337] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:30,233] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:30,298] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:33,591] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:34:33,837] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:15,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:16,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:16,581] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:16,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:18,255] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:18,926] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:18,761] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:19,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:19,488] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:20,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:20,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:20,640] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:21,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:21,635] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:27,103] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:35:27,366] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:05,021] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:05,415] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:05,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:07,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:07,676] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:07,738] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:08,376] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:08,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:09,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:10,163] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:10,130] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:10,397] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:10,967] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:11,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:19,462] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:36:19,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 14:37:03,115 (deepspeed_trainer:228) INFO: 36epoch:train:11201-11300batch: iter_time=2.553, loss_ctc=67.243, loss_att=47.299, acc=0.731, loss=53.267, grad_norm=5.307, loss_scale=1.000, learning_rate=6.690e-05, step_time=0.426 [cnode7-012:0/16] 2024-12-08 14:37:36,634 (deepspeed_trainer:228) INFO: 36epoch:train:11301-11400batch: iter_time=1.114e-04, loss_ctc=74.168, loss_att=52.850, acc=0.720, loss=59.249, grad_norm=5.920, loss_scale=1.000, learning_rate=6.689e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 14:38:10,767 (deepspeed_trainer:228) INFO: 36epoch:train:11401-11500batch: iter_time=1.106e-04, loss_ctc=58.602, loss_att=42.051, acc=0.733, loss=47.012, grad_norm=5.160, loss_scale=1.000, learning_rate=6.689e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 14:38:43,984 (deepspeed_trainer:228) INFO: 36epoch:train:11501-11600batch: iter_time=1.100e-04, loss_ctc=68.232, loss_att=51.161, acc=0.722, loss=56.273, grad_norm=5.828, loss_scale=1.000, learning_rate=6.688e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 14:39:17,379 (deepspeed_trainer:228) INFO: 36epoch:train:11601-11700batch: iter_time=1.131e-04, loss_ctc=70.599, loss_att=49.666, acc=0.730, loss=55.941, grad_norm=5.481, loss_scale=1.000, learning_rate=6.687e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 14:39:50,677 (deepspeed_trainer:228) INFO: 36epoch:train:11701-11800batch: iter_time=1.082e-04, loss_ctc=63.832, loss_att=47.152, acc=0.727, loss=52.137, grad_norm=4.970, loss_scale=1.000, learning_rate=6.687e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 14:40:24,154 (deepspeed_trainer:228) INFO: 36epoch:train:11801-11900batch: iter_time=1.083e-04, loss_ctc=74.048, loss_att=55.800, acc=0.717, loss=61.265, grad_norm=6.000, loss_scale=1.000, learning_rate=6.686e-05, step_time=0.334 [2024-12-08 14:40:57,679] [INFO] [logging.py:129:log_dist] [Rank 0] step=537000, skipped=0, lr=[np.float64(6.68525648001806e-05)], mom=[[0.9, 0.98]] [2024-12-08 14:40:57,680] [INFO] [timer.py:264:stop] epoch=0/micro_step=147000/global_step=147000, RunningAvgSamplesPerSec=49.2277884130836, CurrSamplesPerSec=44.403252305733595, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 14:40:57,682 (deepspeed_trainer:228) INFO: 36epoch:train:11901-12000batch: iter_time=1.089e-04, loss_ctc=75.216, loss_att=54.014, acc=0.729, loss=60.356, grad_norm=5.871, loss_scale=1.000, learning_rate=6.686e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 14:41:30,900 (deepspeed_trainer:228) INFO: 36epoch:train:12001-12100batch: iter_time=1.098e-04, loss_ctc=67.552, loss_att=53.646, acc=0.716, loss=57.820, grad_norm=5.818, loss_scale=1.000, learning_rate=6.685e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 14:42:04,082 (deepspeed_trainer:228) INFO: 36epoch:train:12101-12200batch: iter_time=1.090e-04, loss_ctc=70.757, loss_att=52.875, acc=0.727, loss=58.232, grad_norm=5.349, loss_scale=1.000, learning_rate=6.684e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 14:42:37,591 (deepspeed_trainer:228) INFO: 36epoch:train:12201-12300batch: iter_time=1.087e-04, loss_ctc=71.740, loss_att=51.206, acc=0.733, loss=57.349, grad_norm=5.526, loss_scale=1.000, learning_rate=6.684e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 14:43:10,537 (deepspeed_trainer:228) INFO: 36epoch:train:12301-12400batch: iter_time=1.094e-04, loss_ctc=76.653, loss_att=54.461, acc=0.726, loss=61.119, grad_norm=7.164, loss_scale=1.000, learning_rate=6.683e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 14:43:43,206 (deepspeed_trainer:228) INFO: 36epoch:train:12401-12500batch: iter_time=1.075e-04, loss_ctc=64.227, loss_att=46.351, acc=0.729, loss=51.700, grad_norm=5.429, loss_scale=1.000, learning_rate=6.682e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 14:44:16,158 (deepspeed_trainer:228) INFO: 36epoch:train:12501-12600batch: iter_time=1.080e-04, loss_ctc=69.667, loss_att=49.183, acc=0.734, loss=55.323, grad_norm=5.302, loss_scale=1.000, learning_rate=6.682e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 14:44:49,802 (deepspeed_trainer:228) INFO: 36epoch:train:12601-12700batch: iter_time=1.083e-04, loss_ctc=68.349, loss_att=56.068, acc=0.719, loss=59.735, grad_norm=5.666, loss_scale=1.000, learning_rate=6.681e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 14:45:22,886 (deepspeed_trainer:228) INFO: 36epoch:train:12701-12800batch: iter_time=1.114e-04, loss_ctc=66.034, loss_att=50.352, acc=0.728, loss=55.048, grad_norm=5.487, loss_scale=1.000, learning_rate=6.681e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 14:45:55,912 (deepspeed_trainer:228) INFO: 36epoch:train:12801-12900batch: iter_time=1.098e-04, loss_ctc=68.257, loss_att=52.082, acc=0.719, loss=56.929, grad_norm=5.402, loss_scale=1.000, learning_rate=6.680e-05, step_time=0.330 [2024-12-08 14:46:28,913] [INFO] [logging.py:129:log_dist] [Rank 0] step=538000, skipped=0, lr=[np.float64(6.679040538136424e-05)], mom=[[0.9, 0.98]] [2024-12-08 14:46:28,914] [INFO] [timer.py:264:stop] epoch=0/micro_step=148000/global_step=148000, RunningAvgSamplesPerSec=49.2315005912455, CurrSamplesPerSec=48.530855021276096, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 14:46:28,916 (deepspeed_trainer:228) INFO: 36epoch:train:12901-13000batch: iter_time=1.087e-04, loss_ctc=63.576, loss_att=49.891, acc=0.729, loss=53.983, grad_norm=5.269, loss_scale=1.000, learning_rate=6.679e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 14:47:01,750 (deepspeed_trainer:228) INFO: 36epoch:train:13001-13100batch: iter_time=1.111e-04, loss_ctc=60.083, loss_att=42.672, acc=0.738, loss=47.887, grad_norm=4.849, loss_scale=1.000, learning_rate=6.679e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 14:47:14,053 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-08 14:47:42,363 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 14:47:58,912 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 14:47:58,913 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 14:47:58,915 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 14:48:23,519] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:48:24,116] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:48:24,687] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:48:24,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:48:24,962] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:48:25,361] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:48:25,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:48:25,985] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:48:26,779] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:48:27,369] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:48:27,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:48:27,517] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:48:27,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:48:28,603] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:48:28,623] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:48:29,203] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:13,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:14,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:14,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:14,427] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:15,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:15,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:15,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:16,794] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:16,815] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:17,272] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:17,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:17,935] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:18,153] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:18,586] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:18,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:49:21,286] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:03,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:03,956] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:05,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:05,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:05,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:06,459] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:07,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:07,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:08,170] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:08,365] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:08,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:08,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:09,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:10,722] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:11,132] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:14,888] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:50,734] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:52,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:54,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:54,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:55,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:56,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:56,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:57,398] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:57,719] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:57,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:58,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:50:59,055] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:51:00,177] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:51:00,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:51:01,293] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 14:51:06,572] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 14:51:58,581 (deepspeed_trainer:228) INFO: 36epoch:train:13101-13200batch: iter_time=2.552, loss_ctc=72.126, loss_att=50.392, acc=0.728, loss=56.919, grad_norm=5.427, loss_scale=1.000, learning_rate=6.678e-05, step_time=0.416 [cnode7-012:0/16] 2024-12-08 14:52:32,255 (deepspeed_trainer:228) INFO: 36epoch:train:13201-13300batch: iter_time=1.097e-04, loss_ctc=69.084, loss_att=49.194, acc=0.726, loss=55.127, grad_norm=6.120, loss_scale=1.000, learning_rate=6.677e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 14:53:05,734 (deepspeed_trainer:228) INFO: 36epoch:train:13301-13400batch: iter_time=1.042e-04, loss_ctc=61.672, loss_att=44.864, acc=0.727, loss=49.909, grad_norm=5.205, loss_scale=1.000, learning_rate=6.677e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 14:53:39,089 (deepspeed_trainer:228) INFO: 36epoch:train:13401-13500batch: iter_time=1.126e-04, loss_ctc=64.555, loss_att=47.216, acc=0.729, loss=52.436, grad_norm=5.565, loss_scale=1.000, learning_rate=6.676e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 14:54:12,676 (deepspeed_trainer:228) INFO: 36epoch:train:13501-13600batch: iter_time=1.092e-04, loss_ctc=77.121, loss_att=53.486, acc=0.732, loss=60.549, grad_norm=6.052, loss_scale=1.000, learning_rate=6.676e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 14:54:45,877 (deepspeed_trainer:228) INFO: 36epoch:train:13601-13700batch: iter_time=1.106e-04, loss_ctc=64.897, loss_att=50.333, acc=0.723, loss=54.707, grad_norm=5.292, loss_scale=1.000, learning_rate=6.675e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 14:55:19,137 (deepspeed_trainer:228) INFO: 36epoch:train:13701-13800batch: iter_time=1.108e-04, loss_ctc=73.306, loss_att=50.283, acc=0.725, loss=57.187, grad_norm=6.957, loss_scale=1.000, learning_rate=6.674e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 14:55:52,665 (deepspeed_trainer:228) INFO: 36epoch:train:13801-13900batch: iter_time=1.121e-04, loss_ctc=69.402, loss_att=52.405, acc=0.727, loss=57.504, grad_norm=5.021, loss_scale=1.000, learning_rate=6.674e-05, step_time=0.335 [2024-12-08 14:56:25,708] [INFO] [logging.py:129:log_dist] [Rank 0] step=539000, skipped=0, lr=[np.float64(6.672841902795607e-05)], mom=[[0.9, 0.98]] [2024-12-08 14:56:25,709] [INFO] [timer.py:264:stop] epoch=0/micro_step=149000/global_step=149000, RunningAvgSamplesPerSec=49.22470097757004, CurrSamplesPerSec=47.79400632096199, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 14:56:25,712 (deepspeed_trainer:228) INFO: 36epoch:train:13901-14000batch: iter_time=1.112e-04, loss_ctc=69.161, loss_att=53.700, acc=0.715, loss=58.355, grad_norm=5.240, loss_scale=1.000, learning_rate=6.673e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 14:56:58,863 (deepspeed_trainer:228) INFO: 36epoch:train:14001-14100batch: iter_time=1.117e-04, loss_ctc=76.112, loss_att=55.528, acc=0.724, loss=61.708, grad_norm=6.294, loss_scale=1.000, learning_rate=6.673e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 14:57:31,706 (deepspeed_trainer:228) INFO: 36epoch:train:14101-14200batch: iter_time=1.076e-04, loss_ctc=63.951, loss_att=46.956, acc=0.740, loss=52.061, grad_norm=5.228, loss_scale=1.000, learning_rate=6.672e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 14:58:04,537 (deepspeed_trainer:228) INFO: 36epoch:train:14201-14300batch: iter_time=1.109e-04, loss_ctc=72.779, loss_att=52.237, acc=0.725, loss=58.390, grad_norm=5.822, loss_scale=1.000, learning_rate=6.671e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 14:58:37,436 (deepspeed_trainer:228) INFO: 36epoch:train:14301-14400batch: iter_time=1.090e-04, loss_ctc=70.567, loss_att=49.844, acc=0.734, loss=56.056, grad_norm=4.878, loss_scale=1.000, learning_rate=6.671e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 14:59:10,536 (deepspeed_trainer:228) INFO: 36epoch:train:14401-14500batch: iter_time=1.094e-04, loss_ctc=67.803, loss_att=52.761, acc=0.722, loss=57.271, grad_norm=4.991, loss_scale=1.000, learning_rate=6.670e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 14:59:43,451 (deepspeed_trainer:228) INFO: 36epoch:train:14501-14600batch: iter_time=1.136e-04, loss_ctc=65.978, loss_att=50.997, acc=0.722, loss=55.480, grad_norm=5.723, loss_scale=1.000, learning_rate=6.669e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 15:00:16,498 (deepspeed_trainer:228) INFO: 36epoch:train:14601-14700batch: iter_time=1.106e-04, loss_ctc=69.569, loss_att=51.649, acc=0.730, loss=57.031, grad_norm=5.745, loss_scale=1.000, learning_rate=6.669e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 15:00:49,527 (deepspeed_trainer:228) INFO: 36epoch:train:14701-14800batch: iter_time=1.132e-04, loss_ctc=65.433, loss_att=52.318, acc=0.718, loss=56.262, grad_norm=5.395, loss_scale=1.000, learning_rate=6.668e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 15:01:22,325 (deepspeed_trainer:228) INFO: 36epoch:train:14801-14900batch: iter_time=1.117e-04, loss_ctc=59.767, loss_att=45.628, acc=0.734, loss=49.838, grad_norm=4.892, loss_scale=1.000, learning_rate=6.668e-05, step_time=0.328 [2024-12-08 15:01:55,075] [INFO] [logging.py:129:log_dist] [Rank 0] step=540000, skipped=0, lr=[np.float64(6.666660493835734e-05)], mom=[[0.9, 0.98]] [2024-12-08 15:01:55,075] [INFO] [timer.py:264:stop] epoch=0/micro_step=150000/global_step=150000, RunningAvgSamplesPerSec=49.23018596697038, CurrSamplesPerSec=53.14252089919824, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 15:01:55,078 (deepspeed_trainer:228) INFO: 36epoch:train:14901-15000batch: iter_time=1.077e-04, loss_ctc=64.784, loss_att=46.237, acc=0.732, loss=51.803, grad_norm=5.369, loss_scale=1.000, learning_rate=6.667e-05, step_time=0.327 [2024-12-08 15:02:10,942] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:11,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:11,540] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:11,876] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:11,570] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:11,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:11,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:11,635] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:12,012] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:11,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:12,080] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:11,751] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:12,109] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:12,164] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:11,996] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:12,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:26,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:26,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:27,110] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:27,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:27,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:28,121] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:28,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:28,759] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:28,467] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:28,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:29,052] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:29,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:29,159] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:28,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:28,911] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:29,129] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:42,876] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:43,120] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:43,121] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:43,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:43,524] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:43,927] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:44,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:44,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:44,876] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:45,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:45,298] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:45,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:45,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:45,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:45,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:45,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:58,186] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:59,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:59,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:59,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:59,474] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:02:59,740] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:03:00,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:03:00,680] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:03:00,505] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:03:01,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:03:01,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:03:00,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:03:01,401] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:03:01,800] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:03:01,882] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:03:02,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:03:14,637] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 36 is about to be saved! [2024-12-08 15:03:14,668] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/mp_rank_00_model_states.pt [2024-12-08 15:03:14,668] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/mp_rank_00_model_states.pt... [2024-12-08 15:03:16,762] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/mp_rank_00_model_states.pt. [2024-12-08 15:03:16,920] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-08 15:03:16,921] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-08 15:03:16,921] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-08 15:03:16,921] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-08 15:03:16,578] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-08 15:03:16,922] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-08 15:03:16,580] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-08 15:03:16,580] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-08 15:03:16,580] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-08 15:03:16,581] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-08 15:03:16,925] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-08 15:03:16,582] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-08 15:03:16,925] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-08 15:03:16,926] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-08 15:03:16,585] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-08 15:03:16,585] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-08 15:03:17,665] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,665] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,665] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [2024-12-08 15:03:17,355] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,355] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,356] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,356] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,356] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [2024-12-08 15:03:17,356] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [2024-12-08 15:03:17,700] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,700] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,700] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [2024-12-08 15:03:17,700] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,701] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,701] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [2024-12-08 15:03:17,372] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,373] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,373] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [2024-12-08 15:03:17,380] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,380] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,380] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [2024-12-08 15:03:17,723] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,381] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,382] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,382] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [2024-12-08 15:03:17,725] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,725] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,725] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [2024-12-08 15:03:17,384] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,384] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,384] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [2024-12-08 15:03:17,729] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,729] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [2024-12-08 15:03:17,403] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,404] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,404] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [2024-12-08 15:03:17,407] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,408] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,408] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [2024-12-08 15:03:17,760] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,761] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,761] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [2024-12-08 15:03:17,762] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,763] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,763] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [2024-12-08 15:03:17,767] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-08 15:03:17,767] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_36/36/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-08 15:03:17,767] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 36 is ready now! [cnode7-012:0/16] 2024-12-08 15:03:17,775 (deepspeed_trainer:158) INFO: 36epoch results: [train] iter_time=0.135, loss_ctc=68.982, loss_att=51.045, acc=0.727, loss=56.425, grad_norm=5.591, loss_scale=1.000, learning_rate=6.714e-05, step_time=0.336, time=1 hour, 57 minutes and 54.86 seconds, total_count=540036, gpu_max_cached_mem_GB=41.424, [valid] loss_ctc=3.797, cer_ctc=0.075, loss_att=8.688, acc=0.757, cer=0.487, wer=1.000, loss=7.219, time=1 minute and 10.1 seconds, total_count=36, gpu_max_cached_mem_GB=41.424 [cnode7-012:0/16] 2024-12-08 15:03:19,811 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-08 15:03:46,668 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 15:04:04,543 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 15:04:04,543 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 15:04:04,545 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 15:04:24,766] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:04:25,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:04:25,362] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:04:26,321] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:04:26,271] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:04:27,185] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:04:26,855] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:04:27,570] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:04:27,305] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:04:27,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:04:27,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:04:28,234] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:04:28,272] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:04:28,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:04:29,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:04:30,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:13,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:15,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:15,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:15,524] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:16,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:16,043] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:16,179] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:16,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:16,765] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:17,312] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:17,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:17,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:18,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:18,278] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:18,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:05:20,943] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:01,816] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:03,714] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:04,091] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:04,302] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:04,611] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:05,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:05,548] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:05,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:05,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:07,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:07,337] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:07,893] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:08,273] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:08,385] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:08,909] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:11,727] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:49,680] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:52,630] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:53,334] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:53,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:54,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:54,997] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:55,412] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:55,059] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:55,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:56,127] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:57,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:58,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:59,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:06:59,979] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:07:01,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:07:01,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 15:08:02,192 (deepspeed_trainer:228) INFO: 37epoch:train:1-100batch: iter_time=2.472, loss_ctc=70.838, loss_att=53.072, acc=0.718, loss=58.389, grad_norm=6.260, loss_scale=1.000, learning_rate=6.666e-05, step_time=0.352 [cnode7-012:0/16] 2024-12-08 15:08:35,684 (deepspeed_trainer:228) INFO: 37epoch:train:101-200batch: iter_time=1.153e-04, loss_ctc=79.824, loss_att=57.395, acc=0.730, loss=64.101, grad_norm=6.440, loss_scale=1.000, learning_rate=6.666e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 15:09:09,497 (deepspeed_trainer:228) INFO: 37epoch:train:201-300batch: iter_time=1.085e-04, loss_ctc=81.659, loss_att=62.634, acc=0.717, loss=68.344, grad_norm=6.369, loss_scale=1.000, learning_rate=6.665e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 15:09:43,062 (deepspeed_trainer:228) INFO: 37epoch:train:301-400batch: iter_time=1.073e-04, loss_ctc=86.351, loss_att=60.292, acc=0.716, loss=68.158, grad_norm=7.110, loss_scale=1.000, learning_rate=6.664e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 15:10:16,624 (deepspeed_trainer:228) INFO: 37epoch:train:401-500batch: iter_time=1.106e-04, loss_ctc=85.243, loss_att=64.275, acc=0.699, loss=70.576, grad_norm=8.147, loss_scale=1.000, learning_rate=6.664e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 15:10:50,072 (deepspeed_trainer:228) INFO: 37epoch:train:501-600batch: iter_time=1.114e-04, loss_ctc=68.880, loss_att=54.580, acc=0.732, loss=58.842, grad_norm=5.262, loss_scale=1.000, learning_rate=6.663e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 15:11:23,389 (deepspeed_trainer:228) INFO: 37epoch:train:601-700batch: iter_time=1.140e-04, loss_ctc=67.609, loss_att=52.556, acc=0.717, loss=57.072, grad_norm=5.404, loss_scale=1.000, learning_rate=6.663e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 15:11:56,674 (deepspeed_trainer:228) INFO: 37epoch:train:701-800batch: iter_time=1.158e-04, loss_ctc=74.125, loss_att=55.228, acc=0.719, loss=60.864, grad_norm=6.260, loss_scale=1.000, learning_rate=6.662e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 15:12:29,975 (deepspeed_trainer:228) INFO: 37epoch:train:801-900batch: iter_time=1.091e-04, loss_ctc=75.790, loss_att=52.525, acc=0.725, loss=59.493, grad_norm=6.824, loss_scale=1.000, learning_rate=6.661e-05, step_time=0.333 [2024-12-08 15:13:03,352] [INFO] [logging.py:129:log_dist] [Rank 0] step=541000, skipped=0, lr=[np.float64(6.66049623161576e-05)], mom=[[0.9, 0.98]] [2024-12-08 15:13:03,353] [INFO] [timer.py:264:stop] epoch=0/micro_step=151000/global_step=151000, RunningAvgSamplesPerSec=49.22912231635514, CurrSamplesPerSec=48.24365555703988, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 15:13:03,356 (deepspeed_trainer:228) INFO: 37epoch:train:901-1000batch: iter_time=1.103e-04, loss_ctc=73.350, loss_att=55.996, acc=0.723, loss=61.207, grad_norm=5.379, loss_scale=1.000, learning_rate=6.661e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 15:13:36,784 (deepspeed_trainer:228) INFO: 37epoch:train:1001-1100batch: iter_time=1.124e-04, loss_ctc=73.995, loss_att=58.566, acc=0.722, loss=63.208, grad_norm=5.138, loss_scale=1.000, learning_rate=6.660e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 15:14:10,179 (deepspeed_trainer:228) INFO: 37epoch:train:1101-1200batch: iter_time=1.123e-04, loss_ctc=63.835, loss_att=47.291, acc=0.723, loss=52.243, grad_norm=5.522, loss_scale=1.000, learning_rate=6.660e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 15:14:43,882 (deepspeed_trainer:228) INFO: 37epoch:train:1201-1300batch: iter_time=1.199e-04, loss_ctc=68.561, loss_att=52.862, acc=0.713, loss=57.542, grad_norm=5.709, loss_scale=1.000, learning_rate=6.659e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 15:15:17,213 (deepspeed_trainer:228) INFO: 37epoch:train:1301-1400batch: iter_time=1.127e-04, loss_ctc=68.525, loss_att=47.310, acc=0.729, loss=53.670, grad_norm=5.367, loss_scale=1.000, learning_rate=6.658e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 15:15:50,924 (deepspeed_trainer:228) INFO: 37epoch:train:1401-1500batch: iter_time=1.103e-04, loss_ctc=65.541, loss_att=46.984, acc=0.732, loss=52.581, grad_norm=5.350, loss_scale=1.000, learning_rate=6.658e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 15:16:24,501 (deepspeed_trainer:228) INFO: 37epoch:train:1501-1600batch: iter_time=1.162e-04, loss_ctc=76.757, loss_att=52.276, acc=0.730, loss=59.587, grad_norm=6.507, loss_scale=1.000, learning_rate=6.657e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 15:16:57,538 (deepspeed_trainer:228) INFO: 37epoch:train:1601-1700batch: iter_time=1.117e-04, loss_ctc=82.718, loss_att=59.630, acc=0.721, loss=66.534, grad_norm=6.362, loss_scale=1.000, learning_rate=6.656e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 15:17:30,994 (deepspeed_trainer:228) INFO: 37epoch:train:1701-1800batch: iter_time=1.184e-04, loss_ctc=80.672, loss_att=59.519, acc=0.726, loss=65.883, grad_norm=6.837, loss_scale=1.000, learning_rate=6.656e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 15:18:00,640 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-08 15:18:27,234 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 15:18:43,137 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 15:18:43,137 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 15:18:43,140 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 15:19:11,842] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:19:11,885] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:19:12,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:19:12,865] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:19:13,133] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:19:13,396] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:19:13,452] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:19:13,745] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:19:15,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:19:15,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:19:15,616] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:19:15,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:19:15,794] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:19:15,800] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:19:15,805] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:19:15,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:01,492] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:03,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:03,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:04,657] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:04,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:04,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:04,999] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:05,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:07,345] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:07,959] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:08,422] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:09,127] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:09,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:09,378] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:09,395] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:09,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:48,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:50,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:52,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:52,816] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:53,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:53,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:53,944] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:55,562] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:56,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:58,282] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:58,601] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:59,363] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:59,388] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:59,793] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:20:59,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:00,290] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:36,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:39,237] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:40,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:41,984] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:42,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:42,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:42,729] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:44,029] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:46,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:47,318] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:47,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:48,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:48,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:49,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:49,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:21:49,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 15:22:25,998 (deepspeed_trainer:228) INFO: 37epoch:train:1801-1900batch: iter_time=2.605, loss_ctc=70.082, loss_att=55.629, acc=0.714, loss=59.969, grad_norm=5.949, loss_scale=1.000, learning_rate=6.655e-05, step_time=0.344 [2024-12-08 15:22:59,296] [INFO] [logging.py:129:log_dist] [Rank 0] step=542000, skipped=0, lr=[np.float64(6.654349037009168e-05)], mom=[[0.9, 0.98]] [2024-12-08 15:22:59,297] [INFO] [timer.py:264:stop] epoch=0/micro_step=152000/global_step=152000, RunningAvgSamplesPerSec=49.22901567185877, CurrSamplesPerSec=49.32733624566423, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 15:22:59,300 (deepspeed_trainer:228) INFO: 37epoch:train:1901-2000batch: iter_time=1.079e-04, loss_ctc=68.323, loss_att=50.553, acc=0.722, loss=55.859, grad_norm=5.933, loss_scale=1.000, learning_rate=6.655e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 15:23:33,152 (deepspeed_trainer:228) INFO: 37epoch:train:2001-2100batch: iter_time=1.103e-04, loss_ctc=82.791, loss_att=58.427, acc=0.728, loss=65.706, grad_norm=5.855, loss_scale=1.000, learning_rate=6.654e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 15:24:07,189 (deepspeed_trainer:228) INFO: 37epoch:train:2101-2200batch: iter_time=1.103e-04, loss_ctc=85.097, loss_att=65.620, acc=0.707, loss=71.454, grad_norm=6.133, loss_scale=1.000, learning_rate=6.653e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 15:24:41,125 (deepspeed_trainer:228) INFO: 37epoch:train:2201-2300batch: iter_time=1.159e-04, loss_ctc=79.519, loss_att=54.547, acc=0.723, loss=62.039, grad_norm=7.342, loss_scale=1.000, learning_rate=6.653e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 15:25:14,948 (deepspeed_trainer:228) INFO: 37epoch:train:2301-2400batch: iter_time=1.184e-04, loss_ctc=79.757, loss_att=63.228, acc=0.701, loss=68.175, grad_norm=6.533, loss_scale=1.000, learning_rate=6.652e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 15:25:48,398 (deepspeed_trainer:228) INFO: 37epoch:train:2401-2500batch: iter_time=1.142e-04, loss_ctc=64.151, loss_att=49.675, acc=0.728, loss=54.012, grad_norm=5.724, loss_scale=1.000, learning_rate=6.652e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 15:26:21,944 (deepspeed_trainer:228) INFO: 37epoch:train:2501-2600batch: iter_time=1.179e-04, loss_ctc=74.293, loss_att=55.787, acc=0.715, loss=61.329, grad_norm=6.110, loss_scale=1.000, learning_rate=6.651e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 15:26:54,985 (deepspeed_trainer:228) INFO: 37epoch:train:2601-2700batch: iter_time=1.209e-04, loss_ctc=68.002, loss_att=51.962, acc=0.718, loss=56.768, grad_norm=5.998, loss_scale=1.000, learning_rate=6.650e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 15:27:28,708 (deepspeed_trainer:228) INFO: 37epoch:train:2701-2800batch: iter_time=1.222e-04, loss_ctc=78.401, loss_att=55.009, acc=0.726, loss=62.012, grad_norm=6.038, loss_scale=1.000, learning_rate=6.650e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 15:28:02,363 (deepspeed_trainer:228) INFO: 37epoch:train:2801-2900batch: iter_time=1.208e-04, loss_ctc=76.069, loss_att=60.571, acc=0.712, loss=65.233, grad_norm=5.319, loss_scale=1.000, learning_rate=6.649e-05, step_time=0.336 [2024-12-08 15:28:35,324] [INFO] [logging.py:129:log_dist] [Rank 0] step=543000, skipped=0, lr=[np.float64(6.648218831399692e-05)], mom=[[0.9, 0.98]] [2024-12-08 15:28:35,325] [INFO] [timer.py:264:stop] epoch=0/micro_step=153000/global_step=153000, RunningAvgSamplesPerSec=49.227791694619654, CurrSamplesPerSec=51.5594514842648, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 15:28:35,328 (deepspeed_trainer:228) INFO: 37epoch:train:2901-3000batch: iter_time=1.162e-04, loss_ctc=67.479, loss_att=51.004, acc=0.721, loss=55.964, grad_norm=5.946, loss_scale=1.000, learning_rate=6.649e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 15:29:08,529 (deepspeed_trainer:228) INFO: 37epoch:train:3001-3100batch: iter_time=1.134e-04, loss_ctc=64.889, loss_att=48.238, acc=0.721, loss=53.239, grad_norm=4.962, loss_scale=1.000, learning_rate=6.648e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 15:29:41,806 (deepspeed_trainer:228) INFO: 37epoch:train:3101-3200batch: iter_time=1.100e-04, loss_ctc=67.427, loss_att=48.429, acc=0.727, loss=54.111, grad_norm=5.421, loss_scale=1.000, learning_rate=6.647e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 15:30:14,704 (deepspeed_trainer:228) INFO: 37epoch:train:3201-3300batch: iter_time=1.140e-04, loss_ctc=66.561, loss_att=48.941, acc=0.724, loss=54.214, grad_norm=5.738, loss_scale=1.000, learning_rate=6.647e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 15:30:47,269 (deepspeed_trainer:228) INFO: 37epoch:train:3301-3400batch: iter_time=1.133e-04, loss_ctc=64.876, loss_att=44.793, acc=0.730, loss=50.812, grad_norm=5.064, loss_scale=1.000, learning_rate=6.646e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 15:31:20,312 (deepspeed_trainer:228) INFO: 37epoch:train:3401-3500batch: iter_time=1.111e-04, loss_ctc=76.389, loss_att=54.413, acc=0.730, loss=61.016, grad_norm=6.754, loss_scale=1.000, learning_rate=6.645e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 15:31:53,933 (deepspeed_trainer:228) INFO: 37epoch:train:3501-3600batch: iter_time=1.142e-04, loss_ctc=85.205, loss_att=60.991, acc=0.718, loss=68.265, grad_norm=6.279, loss_scale=1.000, learning_rate=6.645e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 15:32:27,026 (deepspeed_trainer:228) INFO: 37epoch:train:3601-3700batch: iter_time=1.114e-04, loss_ctc=75.318, loss_att=55.577, acc=0.720, loss=61.505, grad_norm=5.760, loss_scale=1.000, learning_rate=6.644e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 15:32:47,099 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-08 15:33:13,896 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 15:33:31,130 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 15:33:31,130 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 15:33:31,132 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 15:33:57,507] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:33:57,891] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:33:58,066] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:33:58,500] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:33:58,986] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:33:59,474] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:00,048] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:00,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:00,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:00,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:01,222] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:01,315] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:01,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:01,757] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:01,821] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:02,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:45,563] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:46,460] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:46,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:47,427] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:48,360] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:48,548] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:49,291] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:50,402] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:50,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:50,597] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:51,163] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:51,280] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:51,474] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:51,626] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:51,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:34:52,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:33,359] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:34,834] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:35,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:36,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:37,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:39,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:39,754] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:39,641] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:39,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:41,198] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:41,815] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:42,135] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:42,141] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:42,055] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:42,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:35:44,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:20,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:22,819] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:24,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:24,919] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:26,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:26,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:27,425] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:28,689] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:29,435] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:29,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:30,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:31,006] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:31,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:31,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:32,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:36:36,588] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 15:37:20,594 (deepspeed_trainer:228) INFO: 37epoch:train:3701-3800batch: iter_time=2.548, loss_ctc=69.244, loss_att=52.593, acc=0.717, loss=57.604, grad_norm=5.536, loss_scale=1.000, learning_rate=6.644e-05, step_time=0.387 [cnode7-012:0/16] 2024-12-08 15:37:54,831 (deepspeed_trainer:228) INFO: 37epoch:train:3801-3900batch: iter_time=1.136e-04, loss_ctc=73.510, loss_att=54.398, acc=0.725, loss=60.113, grad_norm=5.771, loss_scale=1.000, learning_rate=6.643e-05, step_time=0.342 [2024-12-08 15:38:28,255] [INFO] [logging.py:129:log_dist] [Rank 0] step=544000, skipped=0, lr=[np.float64(6.642105536677108e-05)], mom=[[0.9, 0.98]] [2024-12-08 15:38:28,256] [INFO] [timer.py:264:stop] epoch=0/micro_step=154000/global_step=154000, RunningAvgSamplesPerSec=49.224814321278735, CurrSamplesPerSec=45.35985026222081, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 15:38:28,258 (deepspeed_trainer:228) INFO: 37epoch:train:3901-4000batch: iter_time=1.091e-04, loss_ctc=81.184, loss_att=56.502, acc=0.726, loss=63.938, grad_norm=5.969, loss_scale=1.000, learning_rate=6.642e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 15:39:01,876 (deepspeed_trainer:228) INFO: 37epoch:train:4001-4100batch: iter_time=1.099e-04, loss_ctc=84.937, loss_att=63.654, acc=0.713, loss=70.032, grad_norm=5.955, loss_scale=1.000, learning_rate=6.642e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 15:39:35,290 (deepspeed_trainer:228) INFO: 37epoch:train:4101-4200batch: iter_time=1.097e-04, loss_ctc=80.360, loss_att=56.918, acc=0.712, loss=63.931, grad_norm=7.341, loss_scale=1.000, learning_rate=6.641e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 15:40:08,645 (deepspeed_trainer:228) INFO: 37epoch:train:4201-4300batch: iter_time=1.111e-04, loss_ctc=74.654, loss_att=59.264, acc=0.717, loss=63.867, grad_norm=5.847, loss_scale=1.000, learning_rate=6.641e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 15:40:41,909 (deepspeed_trainer:228) INFO: 37epoch:train:4301-4400batch: iter_time=1.183e-04, loss_ctc=66.188, loss_att=50.136, acc=0.721, loss=54.919, grad_norm=5.869, loss_scale=1.000, learning_rate=6.640e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 15:41:15,585 (deepspeed_trainer:228) INFO: 37epoch:train:4401-4500batch: iter_time=1.103e-04, loss_ctc=72.017, loss_att=52.225, acc=0.721, loss=58.181, grad_norm=6.272, loss_scale=1.000, learning_rate=6.639e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 15:41:48,762 (deepspeed_trainer:228) INFO: 37epoch:train:4501-4600batch: iter_time=1.081e-04, loss_ctc=74.024, loss_att=54.821, acc=0.715, loss=60.569, grad_norm=6.106, loss_scale=1.000, learning_rate=6.639e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 15:42:22,007 (deepspeed_trainer:228) INFO: 37epoch:train:4601-4700batch: iter_time=1.127e-04, loss_ctc=70.318, loss_att=49.631, acc=0.737, loss=55.839, grad_norm=5.607, loss_scale=1.000, learning_rate=6.638e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 15:42:55,122 (deepspeed_trainer:228) INFO: 37epoch:train:4701-4800batch: iter_time=1.122e-04, loss_ctc=74.234, loss_att=58.768, acc=0.712, loss=63.409, grad_norm=5.625, loss_scale=1.000, learning_rate=6.638e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 15:43:27,979 (deepspeed_trainer:228) INFO: 37epoch:train:4801-4900batch: iter_time=1.093e-04, loss_ctc=72.268, loss_att=55.963, acc=0.718, loss=60.828, grad_norm=5.646, loss_scale=1.000, learning_rate=6.637e-05, step_time=0.328 [2024-12-08 15:44:00,886] [INFO] [logging.py:129:log_dist] [Rank 0] step=545000, skipped=0, lr=[np.float64(6.636009075233045e-05)], mom=[[0.9, 0.98]] [2024-12-08 15:44:00,887] [INFO] [timer.py:264:stop] epoch=0/micro_step=155000/global_step=155000, RunningAvgSamplesPerSec=49.22718672944301, CurrSamplesPerSec=50.325538694593185, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 15:44:00,889 (deepspeed_trainer:228) INFO: 37epoch:train:4901-5000batch: iter_time=1.131e-04, loss_ctc=63.406, loss_att=48.556, acc=0.711, loss=53.001, grad_norm=5.727, loss_scale=1.000, learning_rate=6.636e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 15:44:34,659 (deepspeed_trainer:228) INFO: 37epoch:train:5001-5100batch: iter_time=1.112e-04, loss_ctc=64.808, loss_att=43.728, acc=0.741, loss=50.041, grad_norm=4.892, loss_scale=1.000, learning_rate=6.636e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 15:45:07,526 (deepspeed_trainer:228) INFO: 37epoch:train:5101-5200batch: iter_time=1.100e-04, loss_ctc=69.150, loss_att=51.214, acc=0.716, loss=56.605, grad_norm=6.292, loss_scale=1.000, learning_rate=6.635e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 15:45:40,070 (deepspeed_trainer:228) INFO: 37epoch:train:5201-5300batch: iter_time=1.110e-04, loss_ctc=66.744, loss_att=45.324, acc=0.732, loss=51.764, grad_norm=5.474, loss_scale=1.000, learning_rate=6.634e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-08 15:46:13,133 (deepspeed_trainer:228) INFO: 37epoch:train:5301-5400batch: iter_time=1.106e-04, loss_ctc=85.259, loss_att=61.538, acc=0.725, loss=68.630, grad_norm=7.017, loss_scale=1.000, learning_rate=6.634e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 15:46:46,581 (deepspeed_trainer:228) INFO: 37epoch:train:5401-5500batch: iter_time=1.098e-04, loss_ctc=79.238, loss_att=56.035, acc=0.723, loss=62.997, grad_norm=5.695, loss_scale=1.000, learning_rate=6.633e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 15:47:19,600 (deepspeed_trainer:228) INFO: 37epoch:train:5501-5600batch: iter_time=1.108e-04, loss_ctc=67.299, loss_att=50.781, acc=0.726, loss=55.754, grad_norm=5.755, loss_scale=1.000, learning_rate=6.633e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 15:47:31,668 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-08 15:47:58,155 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 15:48:14,850 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 15:48:14,850 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 15:48:14,853 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 15:48:40,467] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:48:41,146] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:48:41,843] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:48:41,868] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:48:42,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:48:42,418] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:48:43,190] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:48:43,087] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:48:43,843] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:48:43,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:48:44,237] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:48:44,457] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:48:44,801] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:48:44,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:48:45,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:48:45,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:29,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:30,807] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:31,098] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:31,831] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:32,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:32,488] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:33,271] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:33,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:33,356] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:33,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:33,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:33,847] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:34,492] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:34,841] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:35,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:49:36,471] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:17,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:19,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:19,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:20,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:21,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:21,791] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:22,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:22,228] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:22,884] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:23,427] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:23,713] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:24,378] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:24,538] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:24,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:24,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:50:28,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:04,356] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:07,027] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:07,337] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:08,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:09,066] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:09,443] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:10,474] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:11,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:12,248] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:13,397] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:13,661] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:14,513] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:15,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:15,491] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:15,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 15:51:18,936] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 15:52:10,852 (deepspeed_trainer:228) INFO: 37epoch:train:5601-5700batch: iter_time=2.518, loss_ctc=71.716, loss_att=55.158, acc=0.721, loss=60.128, grad_norm=5.897, loss_scale=1.000, learning_rate=6.632e-05, step_time=0.399 [cnode7-012:0/16] 2024-12-08 15:52:44,084 (deepspeed_trainer:228) INFO: 37epoch:train:5701-5800batch: iter_time=1.136e-04, loss_ctc=75.512, loss_att=53.967, acc=0.733, loss=60.478, grad_norm=6.025, loss_scale=1.000, learning_rate=6.631e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 15:53:17,597 (deepspeed_trainer:228) INFO: 37epoch:train:5801-5900batch: iter_time=1.274e-04, loss_ctc=78.943, loss_att=61.758, acc=0.727, loss=66.866, grad_norm=5.900, loss_scale=1.000, learning_rate=6.631e-05, step_time=0.335 [2024-12-08 15:53:51,077] [INFO] [logging.py:129:log_dist] [Rank 0] step=546000, skipped=0, lr=[np.float64(6.629929369956847e-05)], mom=[[0.9, 0.98]] [2024-12-08 15:53:51,077] [INFO] [timer.py:264:stop] epoch=0/micro_step=156000/global_step=156000, RunningAvgSamplesPerSec=49.223815676328314, CurrSamplesPerSec=49.73336085471479, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 15:53:51,080 (deepspeed_trainer:228) INFO: 37epoch:train:5901-6000batch: iter_time=1.116e-04, loss_ctc=86.842, loss_att=63.468, acc=0.721, loss=70.478, grad_norm=7.746, loss_scale=1.000, learning_rate=6.630e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 15:54:24,386 (deepspeed_trainer:228) INFO: 37epoch:train:6001-6100batch: iter_time=1.088e-04, loss_ctc=80.471, loss_att=58.383, acc=0.712, loss=65.024, grad_norm=6.667, loss_scale=1.000, learning_rate=6.630e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 15:54:57,729 (deepspeed_trainer:228) INFO: 37epoch:train:6101-6200batch: iter_time=1.121e-04, loss_ctc=65.966, loss_att=55.540, acc=0.726, loss=58.678, grad_norm=5.176, loss_scale=1.000, learning_rate=6.629e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 15:55:31,026 (deepspeed_trainer:228) INFO: 37epoch:train:6201-6300batch: iter_time=1.135e-04, loss_ctc=72.047, loss_att=56.135, acc=0.718, loss=60.956, grad_norm=5.786, loss_scale=1.000, learning_rate=6.628e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 15:56:03,905 (deepspeed_trainer:228) INFO: 37epoch:train:6301-6400batch: iter_time=1.093e-04, loss_ctc=68.950, loss_att=48.874, acc=0.727, loss=54.905, grad_norm=6.466, loss_scale=1.000, learning_rate=6.628e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 15:56:36,729 (deepspeed_trainer:228) INFO: 37epoch:train:6401-6500batch: iter_time=1.193e-04, loss_ctc=71.371, loss_att=52.477, acc=0.726, loss=58.148, grad_norm=6.616, loss_scale=1.000, learning_rate=6.627e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 15:57:09,878 (deepspeed_trainer:228) INFO: 37epoch:train:6501-6600batch: iter_time=1.121e-04, loss_ctc=76.474, loss_att=57.658, acc=0.737, loss=63.310, grad_norm=5.065, loss_scale=1.000, learning_rate=6.627e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 15:57:42,893 (deepspeed_trainer:228) INFO: 37epoch:train:6601-6700batch: iter_time=1.109e-04, loss_ctc=71.593, loss_att=57.474, acc=0.720, loss=61.733, grad_norm=6.074, loss_scale=1.000, learning_rate=6.626e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 15:58:15,891 (deepspeed_trainer:228) INFO: 37epoch:train:6701-6800batch: iter_time=1.098e-04, loss_ctc=65.274, loss_att=49.038, acc=0.730, loss=53.898, grad_norm=6.014, loss_scale=1.000, learning_rate=6.625e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 15:58:49,033 (deepspeed_trainer:228) INFO: 37epoch:train:6801-6900batch: iter_time=1.116e-04, loss_ctc=66.599, loss_att=52.765, acc=0.713, loss=56.907, grad_norm=5.247, loss_scale=1.000, learning_rate=6.625e-05, step_time=0.331 [2024-12-08 15:59:21,840] [INFO] [logging.py:129:log_dist] [Rank 0] step=547000, skipped=0, lr=[np.float64(6.62386634423148e-05)], mom=[[0.9, 0.98]] [2024-12-08 15:59:21,840] [INFO] [timer.py:264:stop] epoch=0/micro_step=157000/global_step=157000, RunningAvgSamplesPerSec=49.22776547084125, CurrSamplesPerSec=51.0553242534401, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 15:59:21,843 (deepspeed_trainer:228) INFO: 37epoch:train:6901-7000batch: iter_time=1.117e-04, loss_ctc=69.173, loss_att=46.426, acc=0.732, loss=53.271, grad_norm=5.862, loss_scale=1.000, learning_rate=6.624e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 15:59:54,607 (deepspeed_trainer:228) INFO: 37epoch:train:7001-7100batch: iter_time=1.110e-04, loss_ctc=61.739, loss_att=45.495, acc=0.736, loss=50.390, grad_norm=4.972, loss_scale=1.000, learning_rate=6.624e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 16:00:27,400 (deepspeed_trainer:228) INFO: 37epoch:train:7101-7200batch: iter_time=1.158e-04, loss_ctc=71.733, loss_att=44.577, acc=0.740, loss=52.726, grad_norm=5.386, loss_scale=1.000, learning_rate=6.623e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 16:01:00,650 (deepspeed_trainer:228) INFO: 37epoch:train:7201-7300batch: iter_time=1.106e-04, loss_ctc=80.501, loss_att=63.585, acc=0.723, loss=68.658, grad_norm=6.589, loss_scale=1.000, learning_rate=6.622e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 16:01:33,989 (deepspeed_trainer:228) INFO: 37epoch:train:7301-7400batch: iter_time=1.100e-04, loss_ctc=81.281, loss_att=60.809, acc=0.722, loss=66.972, grad_norm=5.654, loss_scale=1.000, learning_rate=6.622e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 16:02:07,117 (deepspeed_trainer:228) INFO: 37epoch:train:7401-7500batch: iter_time=1.098e-04, loss_ctc=65.632, loss_att=52.135, acc=0.727, loss=56.201, grad_norm=5.710, loss_scale=1.000, learning_rate=6.621e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 16:02:10,656 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-08 16:02:37,773 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 16:02:55,047 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 16:02:55,047 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 16:02:55,050 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 16:03:20,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:03:20,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:03:20,620] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:03:21,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:03:21,183] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:03:21,896] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:03:22,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:03:22,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:03:22,404] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:03:22,743] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:03:22,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:03:23,127] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:03:23,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:03:23,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:03:23,867] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:03:24,020] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:09,916] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:10,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:10,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:10,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:12,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:12,624] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:12,771] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:12,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:13,760] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:14,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:14,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:14,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:14,989] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:15,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:15,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:15,986] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:58,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:58,816] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:04:59,648] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:00,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:00,664] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:01,904] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:02,079] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:02,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:03,282] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:04,425] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:04,811] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:05,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:05,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:05,966] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:06,079] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:06,946] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:46,909] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:47,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:48,360] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:49,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:50,380] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:50,401] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:50,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:51,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:51,195] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:55,186] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:55,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:56,582] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:56,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:56,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:57,101] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:05:57,492] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 16:07:01,287 (deepspeed_trainer:228) INFO: 37epoch:train:7501-7600batch: iter_time=2.590, loss_ctc=69.028, loss_att=50.659, acc=0.724, loss=56.157, grad_norm=6.092, loss_scale=1.000, learning_rate=6.621e-05, step_time=0.351 [cnode7-012:0/16] 2024-12-08 16:07:34,999 (deepspeed_trainer:228) INFO: 37epoch:train:7601-7700batch: iter_time=1.109e-04, loss_ctc=78.533, loss_att=55.436, acc=0.736, loss=62.394, grad_norm=6.310, loss_scale=1.000, learning_rate=6.620e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 16:08:08,523 (deepspeed_trainer:228) INFO: 37epoch:train:7701-7800batch: iter_time=1.079e-04, loss_ctc=79.606, loss_att=60.690, acc=0.725, loss=66.388, grad_norm=5.653, loss_scale=1.000, learning_rate=6.619e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 16:08:42,281 (deepspeed_trainer:228) INFO: 37epoch:train:7801-7900batch: iter_time=1.112e-04, loss_ctc=82.509, loss_att=59.333, acc=0.723, loss=66.274, grad_norm=7.279, loss_scale=1.000, learning_rate=6.619e-05, step_time=0.337 [2024-12-08 16:09:15,651] [INFO] [logging.py:129:log_dist] [Rank 0] step=548000, skipped=0, lr=[np.float64(6.617819921929475e-05)], mom=[[0.9, 0.98]] [2024-12-08 16:09:15,652] [INFO] [timer.py:264:stop] epoch=0/micro_step=158000/global_step=158000, RunningAvgSamplesPerSec=49.2278185397734, CurrSamplesPerSec=52.01043318290129, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 16:09:15,654 (deepspeed_trainer:228) INFO: 37epoch:train:7901-8000batch: iter_time=1.089e-04, loss_ctc=81.572, loss_att=63.367, acc=0.702, loss=68.811, grad_norm=6.985, loss_scale=1.000, learning_rate=6.618e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 16:09:48,933 (deepspeed_trainer:228) INFO: 37epoch:train:8001-8100batch: iter_time=1.091e-04, loss_ctc=68.198, loss_att=54.220, acc=0.736, loss=58.381, grad_norm=6.667, loss_scale=1.000, learning_rate=6.618e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 16:10:22,021 (deepspeed_trainer:228) INFO: 37epoch:train:8101-8200batch: iter_time=1.112e-04, loss_ctc=67.016, loss_att=51.355, acc=0.723, loss=56.070, grad_norm=5.718, loss_scale=1.000, learning_rate=6.617e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 16:10:55,250 (deepspeed_trainer:228) INFO: 37epoch:train:8201-8300batch: iter_time=1.158e-04, loss_ctc=71.648, loss_att=53.461, acc=0.729, loss=58.898, grad_norm=5.836, loss_scale=1.000, learning_rate=6.616e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 16:11:28,331 (deepspeed_trainer:228) INFO: 37epoch:train:8301-8400batch: iter_time=1.118e-04, loss_ctc=73.406, loss_att=51.606, acc=0.732, loss=58.157, grad_norm=6.196, loss_scale=1.000, learning_rate=6.616e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 16:12:01,466 (deepspeed_trainer:228) INFO: 37epoch:train:8401-8500batch: iter_time=1.120e-04, loss_ctc=71.868, loss_att=55.482, acc=0.726, loss=60.388, grad_norm=5.401, loss_scale=1.000, learning_rate=6.615e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 16:12:35,288 (deepspeed_trainer:228) INFO: 37epoch:train:8501-8600batch: iter_time=1.130e-04, loss_ctc=72.722, loss_att=57.706, acc=0.725, loss=62.213, grad_norm=5.310, loss_scale=1.000, learning_rate=6.614e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 16:13:07,965 (deepspeed_trainer:228) INFO: 37epoch:train:8601-8700batch: iter_time=1.141e-04, loss_ctc=62.633, loss_att=47.728, acc=0.725, loss=52.189, grad_norm=5.153, loss_scale=1.000, learning_rate=6.614e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 16:13:41,230 (deepspeed_trainer:228) INFO: 37epoch:train:8701-8800batch: iter_time=1.126e-04, loss_ctc=67.081, loss_att=51.932, acc=0.718, loss=56.472, grad_norm=5.992, loss_scale=1.000, learning_rate=6.613e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 16:14:14,317 (deepspeed_trainer:228) INFO: 37epoch:train:8801-8900batch: iter_time=1.121e-04, loss_ctc=67.122, loss_att=46.539, acc=0.732, loss=52.733, grad_norm=6.581, loss_scale=1.000, learning_rate=6.613e-05, step_time=0.330 [2024-12-08 16:14:47,076] [INFO] [logging.py:129:log_dist] [Rank 0] step=549000, skipped=0, lr=[np.float64(6.611790027408914e-05)], mom=[[0.9, 0.98]] [2024-12-08 16:14:47,076] [INFO] [timer.py:264:stop] epoch=0/micro_step=159000/global_step=159000, RunningAvgSamplesPerSec=49.23114826275498, CurrSamplesPerSec=51.86500333285087, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 16:14:47,079 (deepspeed_trainer:228) INFO: 37epoch:train:8901-9000batch: iter_time=1.093e-04, loss_ctc=64.161, loss_att=45.517, acc=0.739, loss=51.116, grad_norm=5.502, loss_scale=1.000, learning_rate=6.612e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 16:15:20,011 (deepspeed_trainer:228) INFO: 37epoch:train:9001-9100batch: iter_time=1.130e-04, loss_ctc=75.481, loss_att=52.142, acc=0.733, loss=59.134, grad_norm=6.103, loss_scale=1.000, learning_rate=6.611e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 16:15:53,133 (deepspeed_trainer:228) INFO: 37epoch:train:9101-9200batch: iter_time=1.134e-04, loss_ctc=80.218, loss_att=59.319, acc=0.721, loss=65.606, grad_norm=6.865, loss_scale=1.000, learning_rate=6.611e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 16:16:26,257 (deepspeed_trainer:228) INFO: 37epoch:train:9201-9300batch: iter_time=1.114e-04, loss_ctc=79.256, loss_att=58.121, acc=0.733, loss=64.472, grad_norm=5.244, loss_scale=1.000, learning_rate=6.610e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 16:16:55,139 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-08 16:17:23,211 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 16:17:39,309 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 16:17:39,309 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 16:17:39,312 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 16:18:06,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:06,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:06,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:05,970] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:06,708] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:06,379] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:06,856] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:07,061] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:08,152] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:07,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:08,789] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:08,582] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:08,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:08,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:10,275] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:10,707] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:56,952] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:57,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:57,192] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:57,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:57,728] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:57,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:58,038] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:57,990] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:58,565] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:58,361] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:58,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:18:59,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:00,110] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:00,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:01,574] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:01,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:44,686] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:44,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:46,077] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:46,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:46,458] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:46,900] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:46,943] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:46,718] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:48,233] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:48,853] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:49,270] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:49,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:49,821] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:49,831] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:49,958] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:19:50,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:33,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:33,942] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:35,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:35,842] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:36,092] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:37,118] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:36,716] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:36,803] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:37,464] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:39,179] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:39,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:40,599] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:40,691] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:40,481] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:42,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:20:43,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 16:21:19,374 (deepspeed_trainer:228) INFO: 37epoch:train:9301-9400batch: iter_time=2.540, loss_ctc=68.060, loss_att=54.177, acc=0.719, loss=58.356, grad_norm=5.658, loss_scale=1.000, learning_rate=6.610e-05, step_time=0.391 [cnode7-012:0/16] 2024-12-08 16:21:53,228 (deepspeed_trainer:228) INFO: 37epoch:train:9401-9500batch: iter_time=1.110e-04, loss_ctc=67.468, loss_att=51.042, acc=0.723, loss=55.965, grad_norm=5.450, loss_scale=1.000, learning_rate=6.609e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 16:22:26,908 (deepspeed_trainer:228) INFO: 37epoch:train:9501-9600batch: iter_time=1.198e-04, loss_ctc=82.232, loss_att=57.951, acc=0.731, loss=65.224, grad_norm=5.603, loss_scale=1.000, learning_rate=6.608e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 16:23:00,819 (deepspeed_trainer:228) INFO: 37epoch:train:9601-9700batch: iter_time=1.121e-04, loss_ctc=83.581, loss_att=65.594, acc=0.710, loss=71.020, grad_norm=6.024, loss_scale=1.000, learning_rate=6.608e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 16:23:34,553 (deepspeed_trainer:228) INFO: 37epoch:train:9701-9800batch: iter_time=1.164e-04, loss_ctc=77.219, loss_att=54.670, acc=0.723, loss=61.412, grad_norm=6.766, loss_scale=1.000, learning_rate=6.607e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 16:24:08,243 (deepspeed_trainer:228) INFO: 37epoch:train:9801-9900batch: iter_time=1.129e-04, loss_ctc=79.118, loss_att=63.152, acc=0.703, loss=67.905, grad_norm=5.832, loss_scale=1.000, learning_rate=6.607e-05, step_time=0.336 [2024-12-08 16:24:41,922] [INFO] [logging.py:129:log_dist] [Rank 0] step=550000, skipped=0, lr=[np.float64(6.605776585509452e-05)], mom=[[0.9, 0.98]] [2024-12-08 16:24:41,923] [INFO] [timer.py:264:stop] epoch=0/micro_step=160000/global_step=160000, RunningAvgSamplesPerSec=49.225769526507364, CurrSamplesPerSec=45.86049526156327, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 16:24:41,925 (deepspeed_trainer:228) INFO: 37epoch:train:9901-10000batch: iter_time=1.118e-04, loss_ctc=62.831, loss_att=49.511, acc=0.730, loss=53.500, grad_norm=5.018, loss_scale=1.000, learning_rate=6.606e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 16:25:16,071 (deepspeed_trainer:228) INFO: 37epoch:train:10001-10100batch: iter_time=1.115e-04, loss_ctc=73.301, loss_att=55.859, acc=0.716, loss=61.101, grad_norm=5.717, loss_scale=1.000, learning_rate=6.605e-05, step_time=0.341 [cnode7-012:0/16] 2024-12-08 16:25:49,101 (deepspeed_trainer:228) INFO: 37epoch:train:10101-10200batch: iter_time=1.096e-04, loss_ctc=67.176, loss_att=51.542, acc=0.723, loss=56.261, grad_norm=5.627, loss_scale=1.000, learning_rate=6.605e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 16:26:22,301 (deepspeed_trainer:228) INFO: 37epoch:train:10201-10300batch: iter_time=1.136e-04, loss_ctc=76.364, loss_att=54.377, acc=0.727, loss=60.958, grad_norm=5.732, loss_scale=1.000, learning_rate=6.604e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 16:26:55,626 (deepspeed_trainer:228) INFO: 37epoch:train:10301-10400batch: iter_time=1.127e-04, loss_ctc=74.694, loss_att=59.603, acc=0.717, loss=64.106, grad_norm=5.557, loss_scale=1.000, learning_rate=6.604e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 16:27:28,850 (deepspeed_trainer:228) INFO: 37epoch:train:10401-10500batch: iter_time=1.146e-04, loss_ctc=66.450, loss_att=50.469, acc=0.723, loss=55.263, grad_norm=5.905, loss_scale=1.000, learning_rate=6.603e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 16:28:02,032 (deepspeed_trainer:228) INFO: 37epoch:train:10501-10600batch: iter_time=1.149e-04, loss_ctc=62.928, loss_att=46.840, acc=0.727, loss=51.674, grad_norm=5.121, loss_scale=1.000, learning_rate=6.602e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 16:28:35,286 (deepspeed_trainer:228) INFO: 37epoch:train:10601-10700batch: iter_time=1.124e-04, loss_ctc=66.649, loss_att=47.700, acc=0.730, loss=53.376, grad_norm=5.245, loss_scale=1.000, learning_rate=6.602e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 16:29:08,312 (deepspeed_trainer:228) INFO: 37epoch:train:10701-10800batch: iter_time=1.130e-04, loss_ctc=65.608, loss_att=48.301, acc=0.725, loss=53.487, grad_norm=5.423, loss_scale=1.000, learning_rate=6.601e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 16:29:41,762 (deepspeed_trainer:228) INFO: 37epoch:train:10801-10900batch: iter_time=1.126e-04, loss_ctc=63.196, loss_att=43.597, acc=0.734, loss=49.489, grad_norm=5.617, loss_scale=1.000, learning_rate=6.601e-05, step_time=0.334 [2024-12-08 16:30:15,140] [INFO] [logging.py:129:log_dist] [Rank 0] step=551000, skipped=0, lr=[np.float64(6.599779521548389e-05)], mom=[[0.9, 0.98]] [2024-12-08 16:30:15,141] [INFO] [timer.py:264:stop] epoch=0/micro_step=161000/global_step=161000, RunningAvgSamplesPerSec=49.228196994768766, CurrSamplesPerSec=51.71775384289554, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 16:30:15,144 (deepspeed_trainer:228) INFO: 37epoch:train:10901-11000batch: iter_time=1.160e-04, loss_ctc=76.278, loss_att=54.473, acc=0.731, loss=61.013, grad_norm=5.214, loss_scale=1.000, learning_rate=6.600e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 16:30:48,475 (deepspeed_trainer:228) INFO: 37epoch:train:11001-11100batch: iter_time=1.146e-04, loss_ctc=83.486, loss_att=59.779, acc=0.723, loss=66.962, grad_norm=6.942, loss_scale=1.000, learning_rate=6.599e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 16:31:21,621 (deepspeed_trainer:228) INFO: 37epoch:train:11101-11200batch: iter_time=1.130e-04, loss_ctc=73.888, loss_att=54.422, acc=0.726, loss=60.261, grad_norm=5.624, loss_scale=1.000, learning_rate=6.599e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 16:31:42,669 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-08 16:32:09,915 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 16:32:26,936 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 16:32:26,936 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 16:32:26,938 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 16:32:53,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:32:53,881] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:32:54,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:32:54,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:32:54,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:32:55,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:32:55,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:32:55,748] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:32:56,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:32:56,482] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:32:56,588] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:32:56,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:32:56,611] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:32:56,643] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:32:57,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:32:58,279] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:41,942] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:43,361] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:44,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:44,220] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:44,443] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:44,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:45,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:45,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:45,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:45,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:46,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:47,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:47,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:47,828] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:48,002] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:33:48,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:31,020] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:33,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:33,121] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:33,661] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:33,761] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:34,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:34,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:34,482] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:35,345] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:35,472] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:36,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:36,815] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:37,556] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:37,606] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:38,109] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:34:38,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:19,657] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:21,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:22,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:22,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:23,629] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:23,661] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:24,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:24,834] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:25,016] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:25,853] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:27,840] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:28,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:28,601] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:28,860] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:29,318] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:35:32,781] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 16:36:16,713 (deepspeed_trainer:228) INFO: 37epoch:train:11201-11300batch: iter_time=2.587, loss_ctc=68.097, loss_att=52.699, acc=0.721, loss=57.346, grad_norm=5.817, loss_scale=1.000, learning_rate=6.598e-05, step_time=0.364 [cnode7-012:0/16] 2024-12-08 16:36:50,243 (deepspeed_trainer:228) INFO: 37epoch:train:11301-11400batch: iter_time=1.123e-04, loss_ctc=72.285, loss_att=54.833, acc=0.731, loss=60.082, grad_norm=5.375, loss_scale=1.000, learning_rate=6.598e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 16:37:23,770 (deepspeed_trainer:228) INFO: 37epoch:train:11401-11500batch: iter_time=1.101e-04, loss_ctc=79.443, loss_att=56.892, acc=0.731, loss=63.652, grad_norm=6.065, loss_scale=1.000, learning_rate=6.597e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 16:37:57,253 (deepspeed_trainer:228) INFO: 37epoch:train:11501-11600batch: iter_time=1.096e-04, loss_ctc=83.674, loss_att=63.615, acc=0.723, loss=69.638, grad_norm=6.346, loss_scale=1.000, learning_rate=6.596e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 16:38:30,456 (deepspeed_trainer:228) INFO: 37epoch:train:11601-11700batch: iter_time=1.103e-04, loss_ctc=79.546, loss_att=57.933, acc=0.717, loss=64.407, grad_norm=6.566, loss_scale=1.000, learning_rate=6.596e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 16:39:03,682 (deepspeed_trainer:228) INFO: 37epoch:train:11701-11800batch: iter_time=1.102e-04, loss_ctc=73.174, loss_att=59.757, acc=0.723, loss=63.828, grad_norm=6.096, loss_scale=1.000, learning_rate=6.595e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 16:39:36,785 (deepspeed_trainer:228) INFO: 37epoch:train:11801-11900batch: iter_time=1.090e-04, loss_ctc=65.110, loss_att=49.240, acc=0.730, loss=53.990, grad_norm=5.605, loss_scale=1.000, learning_rate=6.595e-05, step_time=0.331 [2024-12-08 16:40:09,664] [INFO] [logging.py:129:log_dist] [Rank 0] step=552000, skipped=0, lr=[np.float64(6.59379876131677e-05)], mom=[[0.9, 0.98]] [2024-12-08 16:40:09,664] [INFO] [timer.py:264:stop] epoch=0/micro_step=162000/global_step=162000, RunningAvgSamplesPerSec=49.22755963345681, CurrSamplesPerSec=50.61060628646859, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 16:40:09,667 (deepspeed_trainer:228) INFO: 37epoch:train:11901-12000batch: iter_time=1.105e-04, loss_ctc=70.979, loss_att=51.638, acc=0.728, loss=57.431, grad_norm=6.241, loss_scale=1.000, learning_rate=6.594e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 16:40:42,640 (deepspeed_trainer:228) INFO: 37epoch:train:12001-12100batch: iter_time=1.093e-04, loss_ctc=72.020, loss_att=55.243, acc=0.719, loss=60.257, grad_norm=5.802, loss_scale=1.000, learning_rate=6.593e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 16:41:15,875 (deepspeed_trainer:228) INFO: 37epoch:train:12101-12200batch: iter_time=1.169e-04, loss_ctc=70.047, loss_att=50.465, acc=0.738, loss=56.340, grad_norm=5.437, loss_scale=1.000, learning_rate=6.593e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 16:41:49,452 (deepspeed_trainer:228) INFO: 37epoch:train:12201-12300batch: iter_time=1.101e-04, loss_ctc=73.335, loss_att=58.952, acc=0.720, loss=63.286, grad_norm=5.527, loss_scale=1.000, learning_rate=6.592e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 16:42:22,428 (deepspeed_trainer:228) INFO: 37epoch:train:12301-12400batch: iter_time=1.141e-04, loss_ctc=71.259, loss_att=55.428, acc=0.729, loss=60.172, grad_norm=5.344, loss_scale=1.000, learning_rate=6.592e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 16:42:55,076 (deepspeed_trainer:228) INFO: 37epoch:train:12401-12500batch: iter_time=1.102e-04, loss_ctc=61.872, loss_att=47.588, acc=0.722, loss=51.880, grad_norm=5.063, loss_scale=1.000, learning_rate=6.591e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 16:43:28,584 (deepspeed_trainer:228) INFO: 37epoch:train:12501-12600batch: iter_time=1.137e-04, loss_ctc=63.730, loss_att=43.384, acc=0.746, loss=49.489, grad_norm=5.379, loss_scale=1.000, learning_rate=6.591e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 16:44:01,520 (deepspeed_trainer:228) INFO: 37epoch:train:12601-12700batch: iter_time=1.092e-04, loss_ctc=68.342, loss_att=51.258, acc=0.720, loss=56.392, grad_norm=5.882, loss_scale=1.000, learning_rate=6.590e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 16:44:34,141 (deepspeed_trainer:228) INFO: 37epoch:train:12701-12800batch: iter_time=1.082e-04, loss_ctc=66.289, loss_att=44.952, acc=0.738, loss=51.333, grad_norm=5.475, loss_scale=1.000, learning_rate=6.589e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 16:45:07,431 (deepspeed_trainer:228) INFO: 37epoch:train:12801-12900batch: iter_time=1.101e-04, loss_ctc=83.937, loss_att=62.031, acc=0.728, loss=68.615, grad_norm=6.817, loss_scale=1.000, learning_rate=6.589e-05, step_time=0.332 [2024-12-08 16:45:40,694] [INFO] [logging.py:129:log_dist] [Rank 0] step=553000, skipped=0, lr=[np.float64(6.587834231075526e-05)], mom=[[0.9, 0.98]] [2024-12-08 16:45:40,694] [INFO] [timer.py:264:stop] epoch=0/micro_step=163000/global_step=163000, RunningAvgSamplesPerSec=49.23122527590298, CurrSamplesPerSec=46.76060399916079, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 16:45:40,697 (deepspeed_trainer:228) INFO: 37epoch:train:12901-13000batch: iter_time=1.091e-04, loss_ctc=78.282, loss_att=56.718, acc=0.727, loss=63.186, grad_norm=5.828, loss_scale=1.000, learning_rate=6.588e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 16:46:13,869 (deepspeed_trainer:228) INFO: 37epoch:train:13001-13100batch: iter_time=1.105e-04, loss_ctc=66.621, loss_att=52.639, acc=0.726, loss=56.824, grad_norm=5.284, loss_scale=1.000, learning_rate=6.588e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 16:46:26,488 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-08 16:46:54,250 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 16:47:10,749 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 16:47:10,749 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 16:47:10,751 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 16:47:34,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:47:35,203] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:47:36,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:47:36,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:47:36,868] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:47:37,502] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:47:37,889] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:47:38,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:47:38,512] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:47:38,293] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:47:38,336] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:47:39,365] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:47:39,427] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:47:39,077] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:47:39,576] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:47:39,783] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:24,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:25,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:26,766] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:27,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:27,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:28,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:28,807] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:28,606] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:28,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:29,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:29,413] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:30,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:30,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:30,568] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:30,757] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:48:31,771] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:13,043] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:15,069] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:15,717] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:16,264] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:17,290] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:17,882] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:17,884] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:18,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:19,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:19,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:19,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:20,435] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:20,814] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:21,205] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:22,616] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:49:24,116] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:02,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:03,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:06,350] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:06,399] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:06,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:07,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:07,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:08,308] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:10,245] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:10,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:11,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:11,362] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:11,821] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:11,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:13,542] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 16:50:15,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 16:51:07,906 (deepspeed_trainer:228) INFO: 37epoch:train:13101-13200batch: iter_time=2.539, loss_ctc=71.126, loss_att=54.838, acc=0.720, loss=59.718, grad_norm=5.761, loss_scale=1.000, learning_rate=6.587e-05, step_time=0.402 [cnode7-012:0/16] 2024-12-08 16:51:41,321 (deepspeed_trainer:228) INFO: 37epoch:train:13201-13300batch: iter_time=1.112e-04, loss_ctc=74.963, loss_att=52.737, acc=0.733, loss=59.403, grad_norm=6.352, loss_scale=1.000, learning_rate=6.586e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 16:52:15,237 (deepspeed_trainer:228) INFO: 37epoch:train:13301-13400batch: iter_time=1.083e-04, loss_ctc=78.019, loss_att=60.421, acc=0.725, loss=65.700, grad_norm=5.348, loss_scale=1.000, learning_rate=6.586e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 16:52:48,921 (deepspeed_trainer:228) INFO: 37epoch:train:13401-13500batch: iter_time=1.125e-04, loss_ctc=85.556, loss_att=63.608, acc=0.716, loss=70.196, grad_norm=6.612, loss_scale=1.000, learning_rate=6.585e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 16:53:22,612 (deepspeed_trainer:228) INFO: 37epoch:train:13501-13600batch: iter_time=1.095e-04, loss_ctc=78.990, loss_att=57.088, acc=0.709, loss=63.654, grad_norm=6.356, loss_scale=1.000, learning_rate=6.585e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 16:53:55,827 (deepspeed_trainer:228) INFO: 37epoch:train:13601-13700batch: iter_time=1.120e-04, loss_ctc=65.720, loss_att=54.581, acc=0.723, loss=57.922, grad_norm=5.488, loss_scale=1.000, learning_rate=6.584e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 16:54:28,801 (deepspeed_trainer:228) INFO: 37epoch:train:13701-13800batch: iter_time=1.095e-04, loss_ctc=71.139, loss_att=56.116, acc=0.715, loss=60.620, grad_norm=6.046, loss_scale=1.000, learning_rate=6.583e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 16:55:01,513 (deepspeed_trainer:228) INFO: 37epoch:train:13801-13900batch: iter_time=1.101e-04, loss_ctc=68.404, loss_att=48.968, acc=0.725, loss=54.810, grad_norm=5.946, loss_scale=1.000, learning_rate=6.583e-05, step_time=0.326 [2024-12-08 16:55:34,498] [INFO] [logging.py:129:log_dist] [Rank 0] step=554000, skipped=0, lr=[np.float64(6.58188585755166e-05)], mom=[[0.9, 0.98]] [2024-12-08 16:55:34,499] [INFO] [timer.py:264:stop] epoch=0/micro_step=164000/global_step=164000, RunningAvgSamplesPerSec=49.2262243028215, CurrSamplesPerSec=47.5659880425516, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 16:55:34,501 (deepspeed_trainer:228) INFO: 37epoch:train:13901-14000batch: iter_time=1.128e-04, loss_ctc=70.050, loss_att=51.798, acc=0.723, loss=57.272, grad_norm=5.519, loss_scale=1.000, learning_rate=6.582e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 16:56:07,671 (deepspeed_trainer:228) INFO: 37epoch:train:14001-14100batch: iter_time=1.104e-04, loss_ctc=75.651, loss_att=56.546, acc=0.735, loss=62.246, grad_norm=5.705, loss_scale=1.000, learning_rate=6.582e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 16:56:41,285 (deepspeed_trainer:228) INFO: 37epoch:train:14101-14200batch: iter_time=1.099e-04, loss_ctc=71.229, loss_att=55.840, acc=0.718, loss=60.438, grad_norm=6.979, loss_scale=1.000, learning_rate=6.581e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 16:57:14,376 (deepspeed_trainer:228) INFO: 37epoch:train:14201-14300batch: iter_time=1.139e-04, loss_ctc=64.521, loss_att=48.282, acc=0.729, loss=53.150, grad_norm=5.378, loss_scale=1.000, learning_rate=6.580e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 16:57:47,802 (deepspeed_trainer:228) INFO: 37epoch:train:14301-14400batch: iter_time=1.081e-04, loss_ctc=65.865, loss_att=51.133, acc=0.716, loss=55.544, grad_norm=5.484, loss_scale=1.000, learning_rate=6.580e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 16:58:21,258 (deepspeed_trainer:228) INFO: 37epoch:train:14401-14500batch: iter_time=1.113e-04, loss_ctc=69.391, loss_att=46.538, acc=0.731, loss=53.364, grad_norm=5.659, loss_scale=1.000, learning_rate=6.579e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 16:58:54,625 (deepspeed_trainer:228) INFO: 37epoch:train:14501-14600batch: iter_time=1.079e-04, loss_ctc=61.581, loss_att=45.362, acc=0.734, loss=50.254, grad_norm=5.195, loss_scale=1.000, learning_rate=6.579e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 16:59:27,993 (deepspeed_trainer:228) INFO: 37epoch:train:14601-14700batch: iter_time=1.088e-04, loss_ctc=70.127, loss_att=43.592, acc=0.743, loss=51.525, grad_norm=4.887, loss_scale=1.000, learning_rate=6.578e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 17:00:01,257 (deepspeed_trainer:228) INFO: 37epoch:train:14701-14800batch: iter_time=1.074e-04, loss_ctc=80.239, loss_att=63.548, acc=0.720, loss=68.541, grad_norm=6.631, loss_scale=1.000, learning_rate=6.577e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 17:00:34,597 (deepspeed_trainer:228) INFO: 37epoch:train:14801-14900batch: iter_time=1.086e-04, loss_ctc=81.255, loss_att=59.519, acc=0.720, loss=66.042, grad_norm=5.858, loss_scale=1.000, learning_rate=6.577e-05, step_time=0.333 [2024-12-08 17:01:07,423] [INFO] [logging.py:129:log_dist] [Rank 0] step=555000, skipped=0, lr=[np.float64(6.575953567934468e-05)], mom=[[0.9, 0.98]] [2024-12-08 17:01:07,424] [INFO] [timer.py:264:stop] epoch=0/micro_step=165000/global_step=165000, RunningAvgSamplesPerSec=49.22791756521425, CurrSamplesPerSec=53.2963989922643, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 17:01:07,426 (deepspeed_trainer:228) INFO: 37epoch:train:14901-15000batch: iter_time=1.095e-04, loss_ctc=65.276, loss_att=49.431, acc=0.730, loss=54.182, grad_norm=5.067, loss_scale=1.000, learning_rate=6.576e-05, step_time=0.328 [2024-12-08 17:01:22,485] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:22,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:22,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:22,867] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:23,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:23,803] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:24,042] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:23,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:24,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:24,195] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:24,214] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:24,221] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:23,904] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:23,967] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:24,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:24,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:38,215] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:38,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:38,815] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:39,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:39,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:39,796] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:40,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:40,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:40,215] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:40,873] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:41,077] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:41,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:41,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:40,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:40,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:41,031] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:53,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:54,588] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:54,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:54,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:56,153] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:56,473] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:56,008] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:56,497] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:56,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:57,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:57,261] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:57,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:56,973] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:57,037] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:57,128] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:01:57,599] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:08,703] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:10,567] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:10,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:11,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:10,940] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:11,265] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:11,943] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:11,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:12,535] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:12,763] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:12,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:13,224] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:13,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:12,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:13,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:13,590] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:02:26,595] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 37 is about to be saved! [2024-12-08 17:02:26,625] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/mp_rank_00_model_states.pt [2024-12-08 17:02:26,625] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/mp_rank_00_model_states.pt... [2024-12-08 17:02:28,581] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/mp_rank_00_model_states.pt. [2024-12-08 17:02:28,270] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,270] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,740] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,740] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,272] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,272] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,273] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,742] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,274] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,274] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,743] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,744] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,745] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,747] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,278] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,748] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-08 17:02:28,899] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-08 17:02:28,899] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-08 17:02:28,899] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [2024-12-08 17:02:29,021] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-08 17:02:29,021] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-08 17:02:29,022] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [2024-12-08 17:02:29,497] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-08 17:02:29,497] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-08 17:02:29,497] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [2024-12-08 17:02:29,031] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-08 17:02:29,032] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-08 17:02:29,032] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [2024-12-08 17:02:29,034] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-08 17:02:29,034] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-08 17:02:29,034] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [2024-12-08 17:02:29,506] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-08 17:02:29,506] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-08 17:02:29,506] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [2024-12-08 17:02:29,513] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-08 17:02:29,513] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-08 17:02:29,513] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [2024-12-08 17:02:29,048] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-08 17:02:29,048] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-08 17:02:29,048] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [2024-12-08 17:02:29,052] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-08 17:02:29,052] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-08 17:02:29,052] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [2024-12-08 17:02:29,521] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-08 17:02:29,521] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-08 17:02:29,521] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [2024-12-08 17:02:29,074] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-08 17:02:29,074] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-08 17:02:29,074] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [2024-12-08 17:02:29,543] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-08 17:02:29,548] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-08 17:02:29,548] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-08 17:02:29,548] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [2024-12-08 17:02:29,549] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-08 17:02:29,549] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [2024-12-08 17:02:29,087] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-08 17:02:29,087] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-08 17:02:29,087] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [2024-12-08 17:02:29,567] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-08 17:02:29,568] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-08 17:02:29,568] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [2024-12-08 17:02:29,574] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-08 17:02:29,574] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_37/37/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-08 17:02:29,574] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 37 is ready now! [cnode7-012:0/16] 2024-12-08 17:02:29,581 (deepspeed_trainer:158) INFO: 37epoch results: [train] iter_time=0.136, loss_ctc=72.862, loss_att=54.180, acc=0.724, loss=59.784, grad_norm=5.904, loss_scale=1.000, learning_rate=6.621e-05, step_time=0.335, time=1 hour, 57 minutes and 57.62 seconds, total_count=555037, gpu_max_cached_mem_GB=41.424, [valid] loss_ctc=3.625, cer_ctc=0.063, loss_att=5.562, acc=0.853, cer=0.338, wer=0.938, loss=4.969, time=1 minute and 9.43 seconds, total_count=37, gpu_max_cached_mem_GB=41.424 [cnode7-012:0/16] 2024-12-08 17:02:31,565 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-08 17:02:59,188 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 17:03:15,126 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 17:03:15,126 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 17:03:15,128 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 17:03:36,682] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:03:37,415] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:03:37,464] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:03:37,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:03:37,823] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:03:38,616] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:03:38,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:03:39,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:03:39,256] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:03:39,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:03:39,318] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:03:40,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:03:39,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:03:39,797] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:03:40,318] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:03:40,515] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:24,981] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:25,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:25,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:26,367] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:27,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:26,767] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:27,838] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:27,884] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:27,427] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:27,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:28,032] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:28,590] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:29,800] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:29,942] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:30,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:04:30,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:13,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:15,130] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:15,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:15,833] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:16,460] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:16,044] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:17,080] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:16,854] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:17,760] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:17,312] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:18,077] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:17,638] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:19,095] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:19,541] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:19,861] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:05:20,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:02,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:04,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:04,763] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:05,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:05,329] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:05,444] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:06,113] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:05,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:06,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:06,761] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:07,443] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:07,138] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:08,265] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:09,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:09,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:06:10,783] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 17:07:13,244 (deepspeed_trainer:228) INFO: 38epoch:train:1-100batch: iter_time=2.467, loss_ctc=68.276, loss_att=46.700, acc=0.736, loss=53.166, grad_norm=5.733, loss_scale=1.000, learning_rate=6.576e-05, step_time=0.350 [cnode7-012:0/16] 2024-12-08 17:07:47,159 (deepspeed_trainer:228) INFO: 38epoch:train:101-200batch: iter_time=1.150e-04, loss_ctc=80.874, loss_att=63.345, acc=0.700, loss=68.618, grad_norm=8.211, loss_scale=1.000, learning_rate=6.575e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 17:08:20,483 (deepspeed_trainer:228) INFO: 38epoch:train:201-300batch: iter_time=1.124e-04, loss_ctc=73.604, loss_att=55.888, acc=0.719, loss=61.172, grad_norm=6.419, loss_scale=1.000, learning_rate=6.574e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 17:08:53,508 (deepspeed_trainer:228) INFO: 38epoch:train:301-400batch: iter_time=1.197e-04, loss_ctc=64.608, loss_att=45.818, acc=0.730, loss=51.447, grad_norm=5.475, loss_scale=1.000, learning_rate=6.574e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 17:09:26,766 (deepspeed_trainer:228) INFO: 38epoch:train:401-500batch: iter_time=1.182e-04, loss_ctc=69.722, loss_att=50.295, acc=0.723, loss=56.112, grad_norm=5.821, loss_scale=1.000, learning_rate=6.573e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 17:09:59,843 (deepspeed_trainer:228) INFO: 38epoch:train:501-600batch: iter_time=1.155e-04, loss_ctc=72.782, loss_att=52.773, acc=0.729, loss=58.764, grad_norm=5.511, loss_scale=1.000, learning_rate=6.573e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 17:10:32,776 (deepspeed_trainer:228) INFO: 38epoch:train:601-700batch: iter_time=1.183e-04, loss_ctc=72.412, loss_att=56.463, acc=0.718, loss=61.255, grad_norm=5.759, loss_scale=1.000, learning_rate=6.572e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 17:11:05,600 (deepspeed_trainer:228) INFO: 38epoch:train:701-800batch: iter_time=1.202e-04, loss_ctc=83.564, loss_att=58.112, acc=0.719, loss=65.734, grad_norm=5.702, loss_scale=1.000, learning_rate=6.572e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 17:11:38,771 (deepspeed_trainer:228) INFO: 38epoch:train:801-900batch: iter_time=1.160e-04, loss_ctc=72.906, loss_att=53.056, acc=0.711, loss=59.014, grad_norm=5.789, loss_scale=1.000, learning_rate=6.571e-05, step_time=0.331 [2024-12-08 17:12:11,711] [INFO] [logging.py:129:log_dist] [Rank 0] step=556000, skipped=0, lr=[np.float64(6.570037289871784e-05)], mom=[[0.9, 0.98]] [2024-12-08 17:12:11,712] [INFO] [timer.py:264:stop] epoch=0/micro_step=166000/global_step=166000, RunningAvgSamplesPerSec=49.22964423850216, CurrSamplesPerSec=51.13827005036156, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 17:12:11,715 (deepspeed_trainer:228) INFO: 38epoch:train:901-1000batch: iter_time=1.137e-04, loss_ctc=62.098, loss_att=49.104, acc=0.721, loss=53.014, grad_norm=5.825, loss_scale=1.000, learning_rate=6.570e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 17:12:45,180 (deepspeed_trainer:228) INFO: 38epoch:train:1001-1100batch: iter_time=1.145e-04, loss_ctc=70.248, loss_att=52.156, acc=0.713, loss=57.555, grad_norm=5.508, loss_scale=1.000, learning_rate=6.570e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 17:13:18,194 (deepspeed_trainer:228) INFO: 38epoch:train:1101-1200batch: iter_time=1.171e-04, loss_ctc=75.968, loss_att=58.836, acc=0.716, loss=64.000, grad_norm=5.681, loss_scale=1.000, learning_rate=6.569e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 17:13:51,429 (deepspeed_trainer:228) INFO: 38epoch:train:1201-1300batch: iter_time=1.172e-04, loss_ctc=72.022, loss_att=53.376, acc=0.719, loss=59.022, grad_norm=6.570, loss_scale=1.000, learning_rate=6.569e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 17:14:24,875 (deepspeed_trainer:228) INFO: 38epoch:train:1301-1400batch: iter_time=1.150e-04, loss_ctc=86.380, loss_att=64.330, acc=0.702, loss=70.971, grad_norm=6.186, loss_scale=1.000, learning_rate=6.568e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 17:14:58,042 (deepspeed_trainer:228) INFO: 38epoch:train:1401-1500batch: iter_time=1.199e-04, loss_ctc=70.386, loss_att=54.344, acc=0.713, loss=59.158, grad_norm=5.626, loss_scale=1.000, learning_rate=6.567e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 17:15:31,140 (deepspeed_trainer:228) INFO: 38epoch:train:1501-1600batch: iter_time=1.230e-04, loss_ctc=75.184, loss_att=52.378, acc=0.714, loss=59.241, grad_norm=6.390, loss_scale=1.000, learning_rate=6.567e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 17:16:04,583 (deepspeed_trainer:228) INFO: 38epoch:train:1601-1700batch: iter_time=1.171e-04, loss_ctc=81.964, loss_att=63.211, acc=0.712, loss=68.836, grad_norm=5.976, loss_scale=1.000, learning_rate=6.566e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 17:16:37,659 (deepspeed_trainer:228) INFO: 38epoch:train:1701-1800batch: iter_time=1.204e-04, loss_ctc=63.834, loss_att=50.947, acc=0.711, loss=54.783, grad_norm=5.550, loss_scale=1.000, learning_rate=6.566e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 17:17:06,618 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-08 17:17:33,366 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 17:17:49,251 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 17:17:49,251 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 17:17:49,256 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 17:18:20,599] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:18:20,769] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:18:20,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:18:20,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:18:21,186] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:18:21,109] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:18:21,111] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:18:21,115] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:18:21,151] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:18:21,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:18:21,526] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:18:21,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:18:21,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:18:21,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:18:21,772] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:18:22,200] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:11,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:11,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:13,435] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:12,404] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:12,667] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:12,843] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:14,159] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:14,230] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:14,248] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:13,272] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:13,554] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:13,676] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:15,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:15,183] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:15,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:15,264] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:19:59,580] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:00,132] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:01,979] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:01,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:02,341] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:03,667] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:02,713] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:04,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:02,857] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:04,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:04,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:04,561] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:03,856] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:04,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:06,760] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:08,075] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:47,747] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:49,966] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:50,487] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:51,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:52,014] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:52,287] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:52,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:51,684] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:53,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:53,641] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:54,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:54,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:56,233] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:55,350] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:57,682] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:20:57,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 17:21:34,952 (deepspeed_trainer:228) INFO: 38epoch:train:1801-1900batch: iter_time=2.579, loss_ctc=66.185, loss_att=47.131, acc=0.733, loss=52.876, grad_norm=5.233, loss_scale=1.000, learning_rate=6.565e-05, step_time=0.379 [2024-12-08 17:22:08,949] [INFO] [logging.py:129:log_dist] [Rank 0] step=557000, skipped=0, lr=[np.float64(6.564136951466292e-05)], mom=[[0.9, 0.98]] [2024-12-08 17:22:08,952] [INFO] [timer.py:264:stop] epoch=0/micro_step=167000/global_step=167000, RunningAvgSamplesPerSec=49.227396883847405, CurrSamplesPerSec=50.599845105165706, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 17:22:08,955 (deepspeed_trainer:228) INFO: 38epoch:train:1901-2000batch: iter_time=1.139e-04, loss_ctc=70.137, loss_att=53.859, acc=0.727, loss=58.706, grad_norm=5.870, loss_scale=1.000, learning_rate=6.564e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 17:22:42,627 (deepspeed_trainer:228) INFO: 38epoch:train:2001-2100batch: iter_time=1.188e-04, loss_ctc=79.968, loss_att=60.906, acc=0.719, loss=66.631, grad_norm=7.870, loss_scale=1.000, learning_rate=6.564e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 17:23:16,043 (deepspeed_trainer:228) INFO: 38epoch:train:2101-2200batch: iter_time=1.196e-04, loss_ctc=71.001, loss_att=54.083, acc=0.727, loss=59.159, grad_norm=5.712, loss_scale=1.000, learning_rate=6.563e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 17:23:49,509 (deepspeed_trainer:228) INFO: 38epoch:train:2201-2300batch: iter_time=1.161e-04, loss_ctc=63.343, loss_att=45.809, acc=0.736, loss=51.067, grad_norm=5.045, loss_scale=1.000, learning_rate=6.563e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 17:24:22,926 (deepspeed_trainer:228) INFO: 38epoch:train:2301-2400batch: iter_time=1.178e-04, loss_ctc=67.447, loss_att=50.372, acc=0.729, loss=55.513, grad_norm=5.945, loss_scale=1.000, learning_rate=6.562e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 17:24:56,531 (deepspeed_trainer:228) INFO: 38epoch:train:2401-2500batch: iter_time=1.170e-04, loss_ctc=73.881, loss_att=57.677, acc=0.727, loss=62.529, grad_norm=5.684, loss_scale=1.000, learning_rate=6.561e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 17:25:29,892 (deepspeed_trainer:228) INFO: 38epoch:train:2501-2600batch: iter_time=1.220e-04, loss_ctc=73.342, loss_att=51.877, acc=0.734, loss=58.340, grad_norm=5.264, loss_scale=1.000, learning_rate=6.561e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 17:26:03,045 (deepspeed_trainer:228) INFO: 38epoch:train:2601-2700batch: iter_time=1.180e-04, loss_ctc=80.313, loss_att=57.701, acc=0.716, loss=64.487, grad_norm=6.359, loss_scale=1.000, learning_rate=6.560e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 17:26:36,344 (deepspeed_trainer:228) INFO: 38epoch:train:2701-2800batch: iter_time=1.203e-04, loss_ctc=70.874, loss_att=53.514, acc=0.718, loss=58.734, grad_norm=6.088, loss_scale=1.000, learning_rate=6.560e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 17:27:09,343 (deepspeed_trainer:228) INFO: 38epoch:train:2801-2900batch: iter_time=1.187e-04, loss_ctc=59.619, loss_att=47.174, acc=0.728, loss=50.884, grad_norm=5.285, loss_scale=1.000, learning_rate=6.559e-05, step_time=0.330 [2024-12-08 17:27:42,741] [INFO] [logging.py:129:log_dist] [Rank 0] step=558000, skipped=0, lr=[np.float64(6.558252481271839e-05)], mom=[[0.9, 0.98]] [2024-12-08 17:27:42,742] [INFO] [timer.py:264:stop] epoch=0/micro_step=168000/global_step=168000, RunningAvgSamplesPerSec=49.22849914680417, CurrSamplesPerSec=48.11601862071162, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 17:27:42,746 (deepspeed_trainer:228) INFO: 38epoch:train:2901-3000batch: iter_time=1.169e-04, loss_ctc=72.213, loss_att=54.877, acc=0.721, loss=60.117, grad_norm=5.872, loss_scale=1.000, learning_rate=6.559e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 17:28:15,976 (deepspeed_trainer:228) INFO: 38epoch:train:3001-3100batch: iter_time=1.202e-04, loss_ctc=70.882, loss_att=54.407, acc=0.720, loss=59.333, grad_norm=6.032, loss_scale=1.000, learning_rate=6.558e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 17:28:49,380 (deepspeed_trainer:228) INFO: 38epoch:train:3101-3200batch: iter_time=1.165e-04, loss_ctc=82.517, loss_att=62.433, acc=0.715, loss=68.459, grad_norm=6.803, loss_scale=1.000, learning_rate=6.557e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 17:29:22,824 (deepspeed_trainer:228) INFO: 38epoch:train:3201-3300batch: iter_time=1.169e-04, loss_ctc=73.918, loss_att=56.367, acc=0.723, loss=61.619, grad_norm=5.644, loss_scale=1.000, learning_rate=6.557e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 17:29:53,797 (deepspeed_trainer:228) INFO: 38epoch:train:3301-3400batch: iter_time=1.172e-04, loss_ctc=73.714, loss_att=54.229, acc=0.724, loss=60.092, grad_norm=5.726, loss_scale=1.000, learning_rate=6.556e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 17:30:27,312 (deepspeed_trainer:228) INFO: 38epoch:train:3401-3500batch: iter_time=1.174e-04, loss_ctc=72.653, loss_att=55.008, acc=0.715, loss=60.288, grad_norm=6.172, loss_scale=1.000, learning_rate=6.556e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 17:31:00,852 (deepspeed_trainer:228) INFO: 38epoch:train:3501-3600batch: iter_time=1.189e-04, loss_ctc=80.250, loss_att=64.314, acc=0.711, loss=69.098, grad_norm=6.352, loss_scale=1.000, learning_rate=6.555e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 17:31:34,102 (deepspeed_trainer:228) INFO: 38epoch:train:3601-3700batch: iter_time=1.214e-04, loss_ctc=66.935, loss_att=50.817, acc=0.719, loss=55.651, grad_norm=5.719, loss_scale=1.000, learning_rate=6.554e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 17:31:54,503 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-08 17:32:21,326 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 17:32:39,366 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 17:32:39,366 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 17:32:39,368 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 17:33:05,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:05,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:06,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:06,847] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:05,900] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:07,339] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:06,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:07,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:07,977] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:06,968] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:07,641] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:07,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:08,248] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:08,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:08,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:10,539] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:55,204] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:55,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:55,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:56,039] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:55,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:57,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:56,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:56,736] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:58,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:57,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:58,771] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:58,005] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:58,103] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:58,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:33:58,689] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:00,334] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:42,762] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:43,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:44,917] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:45,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:45,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:47,103] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:47,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:46,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:46,690] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:47,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:47,907] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:49,521] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:48,334] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:48,940] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:50,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:34:51,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:30,680] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:31,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:32,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:32,953] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:35,002] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:35,521] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:34,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:34,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:35,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:36,886] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:36,205] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:37,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:36,996] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:38,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:40,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:35:41,441] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 17:36:26,416 (deepspeed_trainer:228) INFO: 38epoch:train:3701-3800batch: iter_time=2.575, loss_ctc=64.870, loss_att=45.871, acc=0.741, loss=51.574, grad_norm=6.416, loss_scale=1.000, learning_rate=6.554e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 17:37:00,280 (deepspeed_trainer:228) INFO: 38epoch:train:3801-3900batch: iter_time=1.201e-04, loss_ctc=73.672, loss_att=60.175, acc=0.713, loss=64.205, grad_norm=7.126, loss_scale=1.000, learning_rate=6.553e-05, step_time=0.338 [2024-12-08 17:37:34,316] [INFO] [logging.py:129:log_dist] [Rank 0] step=559000, skipped=0, lr=[np.float64(6.552383808289814e-05)], mom=[[0.9, 0.98]] [2024-12-08 17:37:34,317] [INFO] [timer.py:264:stop] epoch=0/micro_step=169000/global_step=169000, RunningAvgSamplesPerSec=49.22939249966274, CurrSamplesPerSec=49.55136777841355, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 17:37:34,319 (deepspeed_trainer:228) INFO: 38epoch:train:3901-4000batch: iter_time=1.107e-04, loss_ctc=79.066, loss_att=57.975, acc=0.724, loss=64.283, grad_norm=6.998, loss_scale=1.000, learning_rate=6.553e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 17:38:08,000 (deepspeed_trainer:228) INFO: 38epoch:train:4001-4100batch: iter_time=1.127e-04, loss_ctc=62.171, loss_att=47.090, acc=0.740, loss=51.615, grad_norm=5.199, loss_scale=1.000, learning_rate=6.552e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 17:38:41,617 (deepspeed_trainer:228) INFO: 38epoch:train:4101-4200batch: iter_time=1.105e-04, loss_ctc=66.123, loss_att=45.800, acc=0.734, loss=51.889, grad_norm=5.218, loss_scale=1.000, learning_rate=6.552e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 17:39:15,470 (deepspeed_trainer:228) INFO: 38epoch:train:4201-4300batch: iter_time=1.098e-04, loss_ctc=73.153, loss_att=53.082, acc=0.734, loss=59.112, grad_norm=5.721, loss_scale=1.000, learning_rate=6.551e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 17:39:49,281 (deepspeed_trainer:228) INFO: 38epoch:train:4301-4400batch: iter_time=1.086e-04, loss_ctc=67.007, loss_att=52.877, acc=0.736, loss=57.122, grad_norm=5.311, loss_scale=1.000, learning_rate=6.550e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 17:40:22,955 (deepspeed_trainer:228) INFO: 38epoch:train:4401-4500batch: iter_time=1.114e-04, loss_ctc=73.295, loss_att=53.542, acc=0.728, loss=59.437, grad_norm=5.556, loss_scale=1.000, learning_rate=6.550e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 17:40:57,274 (deepspeed_trainer:228) INFO: 38epoch:train:4501-4600batch: iter_time=1.123e-04, loss_ctc=81.167, loss_att=58.899, acc=0.715, loss=65.593, grad_norm=5.580, loss_scale=1.000, learning_rate=6.549e-05, step_time=0.342 [cnode7-012:0/16] 2024-12-08 17:41:30,990 (deepspeed_trainer:228) INFO: 38epoch:train:4601-4700batch: iter_time=1.129e-04, loss_ctc=66.880, loss_att=49.539, acc=0.729, loss=54.739, grad_norm=5.385, loss_scale=1.000, learning_rate=6.549e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 17:42:04,604 (deepspeed_trainer:228) INFO: 38epoch:train:4701-4800batch: iter_time=1.137e-04, loss_ctc=61.179, loss_att=47.594, acc=0.723, loss=51.680, grad_norm=5.438, loss_scale=1.000, learning_rate=6.548e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 17:42:38,633 (deepspeed_trainer:228) INFO: 38epoch:train:4801-4900batch: iter_time=1.203e-04, loss_ctc=76.131, loss_att=58.695, acc=0.726, loss=63.911, grad_norm=5.821, loss_scale=1.000, learning_rate=6.547e-05, step_time=0.340 [2024-12-08 17:43:12,511] [INFO] [logging.py:129:log_dist] [Rank 0] step=560000, skipped=0, lr=[np.float64(6.546530861965541e-05)], mom=[[0.9, 0.98]] [2024-12-08 17:43:12,512] [INFO] [timer.py:264:stop] epoch=0/micro_step=170000/global_step=170000, RunningAvgSamplesPerSec=49.22634276851445, CurrSamplesPerSec=47.11059549919611, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 17:43:12,515 (deepspeed_trainer:228) INFO: 38epoch:train:4901-5000batch: iter_time=1.148e-04, loss_ctc=72.414, loss_att=60.155, acc=0.716, loss=63.869, grad_norm=5.756, loss_scale=1.000, learning_rate=6.547e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 17:43:46,791 (deepspeed_trainer:228) INFO: 38epoch:train:5001-5100batch: iter_time=1.121e-04, loss_ctc=78.765, loss_att=55.593, acc=0.720, loss=62.539, grad_norm=6.666, loss_scale=1.000, learning_rate=6.546e-05, step_time=0.342 [cnode7-012:0/16] 2024-12-08 17:44:20,799 (deepspeed_trainer:228) INFO: 38epoch:train:5101-5200batch: iter_time=1.152e-04, loss_ctc=73.580, loss_att=56.365, acc=0.720, loss=61.552, grad_norm=5.941, loss_scale=1.000, learning_rate=6.546e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 17:44:54,579 (deepspeed_trainer:228) INFO: 38epoch:train:5201-5300batch: iter_time=1.149e-04, loss_ctc=76.404, loss_att=55.759, acc=0.723, loss=61.970, grad_norm=6.290, loss_scale=1.000, learning_rate=6.545e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 17:45:28,293 (deepspeed_trainer:228) INFO: 38epoch:train:5301-5400batch: iter_time=1.129e-04, loss_ctc=70.927, loss_att=56.913, acc=0.718, loss=61.088, grad_norm=5.765, loss_scale=1.000, learning_rate=6.544e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 17:46:02,952 (deepspeed_trainer:228) INFO: 38epoch:train:5401-5500batch: iter_time=1.124e-04, loss_ctc=74.196, loss_att=57.359, acc=0.717, loss=62.388, grad_norm=5.866, loss_scale=1.000, learning_rate=6.544e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 17:46:35,747 (deepspeed_trainer:228) INFO: 38epoch:train:5501-5600batch: iter_time=1.133e-04, loss_ctc=66.432, loss_att=48.631, acc=0.730, loss=53.981, grad_norm=5.544, loss_scale=1.000, learning_rate=6.543e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 17:46:47,709 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-08 17:47:15,242 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 17:47:32,270 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 17:47:32,270 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 17:47:32,272 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 17:47:55,548] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:47:56,085] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:47:56,398] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:47:57,910] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:47:59,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:47:58,100] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:47:59,837] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:47:58,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:47:58,936] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:00,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:00,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:47:59,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:00,816] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:01,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:01,632] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:01,777] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:44,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:45,957] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:46,803] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:47,586] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:47,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:48,483] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:48,584] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:50,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:49,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:51,246] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:52,007] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:52,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:52,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:52,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:52,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:48:54,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:31,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:33,927] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:35,253] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:35,769] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:36,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:36,899] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:37,378] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:39,973] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:40,591] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:41,065] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:41,868] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:42,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:42,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:41,183] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:42,759] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:49:42,940] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:18,772] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:21,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:22,539] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:24,228] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:24,815] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:25,329] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:25,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:29,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:31,384] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:31,834] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:32,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:31,060] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:32,781] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:32,838] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:33,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 17:50:33,611] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 17:51:26,645 (deepspeed_trainer:228) INFO: 38epoch:train:5601-5700batch: iter_time=2.564, loss_ctc=65.091, loss_att=45.927, acc=0.739, loss=51.682, grad_norm=5.698, loss_scale=1.000, learning_rate=6.543e-05, step_time=0.344 [cnode7-012:0/16] 2024-12-08 17:52:00,326 (deepspeed_trainer:228) INFO: 38epoch:train:5701-5800batch: iter_time=1.077e-04, loss_ctc=71.881, loss_att=59.506, acc=0.714, loss=63.237, grad_norm=6.683, loss_scale=1.000, learning_rate=6.542e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 17:52:33,712 (deepspeed_trainer:228) INFO: 38epoch:train:5801-5900batch: iter_time=1.063e-04, loss_ctc=82.892, loss_att=62.025, acc=0.727, loss=68.291, grad_norm=6.811, loss_scale=1.000, learning_rate=6.542e-05, step_time=0.334 [2024-12-08 17:53:07,578] [INFO] [logging.py:129:log_dist] [Rank 0] step=561000, skipped=0, lr=[np.float64(6.540693572184722e-05)], mom=[[0.9, 0.98]] [2024-12-08 17:53:07,579] [INFO] [timer.py:264:stop] epoch=0/micro_step=171000/global_step=171000, RunningAvgSamplesPerSec=49.223422812592204, CurrSamplesPerSec=51.745069982123646, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 17:53:07,581 (deepspeed_trainer:228) INFO: 38epoch:train:5901-6000batch: iter_time=1.303e-04, loss_ctc=62.583, loss_att=44.882, acc=0.739, loss=50.198, grad_norm=5.815, loss_scale=1.000, learning_rate=6.541e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 17:53:41,002 (deepspeed_trainer:228) INFO: 38epoch:train:6001-6100batch: iter_time=1.087e-04, loss_ctc=67.117, loss_att=46.874, acc=0.734, loss=52.938, grad_norm=5.194, loss_scale=1.000, learning_rate=6.540e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 17:54:14,390 (deepspeed_trainer:228) INFO: 38epoch:train:6101-6200batch: iter_time=1.101e-04, loss_ctc=70.266, loss_att=50.662, acc=0.735, loss=56.555, grad_norm=5.366, loss_scale=1.000, learning_rate=6.540e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 17:54:47,650 (deepspeed_trainer:228) INFO: 38epoch:train:6201-6300batch: iter_time=1.078e-04, loss_ctc=67.925, loss_att=53.627, acc=0.738, loss=57.929, grad_norm=5.329, loss_scale=1.000, learning_rate=6.539e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 17:55:20,910 (deepspeed_trainer:228) INFO: 38epoch:train:6301-6400batch: iter_time=1.064e-04, loss_ctc=83.593, loss_att=58.563, acc=0.723, loss=66.034, grad_norm=5.886, loss_scale=1.000, learning_rate=6.539e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 17:55:54,082 (deepspeed_trainer:228) INFO: 38epoch:train:6401-6500batch: iter_time=1.111e-04, loss_ctc=76.138, loss_att=59.552, acc=0.710, loss=64.536, grad_norm=5.785, loss_scale=1.000, learning_rate=6.538e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 17:56:26,760 (deepspeed_trainer:228) INFO: 38epoch:train:6501-6600batch: iter_time=1.122e-04, loss_ctc=58.287, loss_att=43.753, acc=0.734, loss=48.112, grad_norm=5.314, loss_scale=1.000, learning_rate=6.537e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 17:56:59,683 (deepspeed_trainer:228) INFO: 38epoch:train:6601-6700batch: iter_time=1.142e-04, loss_ctc=65.282, loss_att=49.742, acc=0.732, loss=54.370, grad_norm=5.284, loss_scale=1.000, learning_rate=6.537e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 17:57:32,906 (deepspeed_trainer:228) INFO: 38epoch:train:6701-6800batch: iter_time=1.093e-04, loss_ctc=76.421, loss_att=59.246, acc=0.721, loss=64.428, grad_norm=5.834, loss_scale=1.000, learning_rate=6.536e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 17:58:06,105 (deepspeed_trainer:228) INFO: 38epoch:train:6801-6900batch: iter_time=1.112e-04, loss_ctc=70.171, loss_att=56.402, acc=0.718, loss=60.537, grad_norm=6.276, loss_scale=1.000, learning_rate=6.536e-05, step_time=0.332 [2024-12-08 17:58:39,241] [INFO] [logging.py:129:log_dist] [Rank 0] step=562000, skipped=0, lr=[np.float64(6.534871869269909e-05)], mom=[[0.9, 0.98]] [2024-12-08 17:58:39,242] [INFO] [timer.py:264:stop] epoch=0/micro_step=172000/global_step=172000, RunningAvgSamplesPerSec=49.22630939490751, CurrSamplesPerSec=48.47140534213845, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 17:58:39,245 (deepspeed_trainer:228) INFO: 38epoch:train:6901-7000batch: iter_time=1.151e-04, loss_ctc=81.946, loss_att=57.958, acc=0.716, loss=65.127, grad_norm=5.765, loss_scale=1.000, learning_rate=6.535e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 17:59:12,450 (deepspeed_trainer:228) INFO: 38epoch:train:7001-7100batch: iter_time=1.085e-04, loss_ctc=68.097, loss_att=54.910, acc=0.725, loss=58.873, grad_norm=5.335, loss_scale=1.000, learning_rate=6.535e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 17:59:45,835 (deepspeed_trainer:228) INFO: 38epoch:train:7101-7200batch: iter_time=1.127e-04, loss_ctc=76.234, loss_att=53.625, acc=0.719, loss=60.422, grad_norm=6.305, loss_scale=1.000, learning_rate=6.534e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 18:00:19,895 (deepspeed_trainer:228) INFO: 38epoch:train:7201-7300batch: iter_time=1.137e-04, loss_ctc=74.565, loss_att=59.778, acc=0.724, loss=64.199, grad_norm=6.524, loss_scale=1.000, learning_rate=6.533e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 18:00:52,957 (deepspeed_trainer:228) INFO: 38epoch:train:7301-7400batch: iter_time=1.143e-04, loss_ctc=68.756, loss_att=52.272, acc=0.724, loss=57.218, grad_norm=6.146, loss_scale=1.000, learning_rate=6.533e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 18:01:26,005 (deepspeed_trainer:228) INFO: 38epoch:train:7401-7500batch: iter_time=1.105e-04, loss_ctc=69.951, loss_att=53.216, acc=0.722, loss=58.230, grad_norm=5.973, loss_scale=1.000, learning_rate=6.532e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 18:01:29,730 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-08 18:01:58,329 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 18:02:14,314 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 18:02:14,314 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 18:02:14,317 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 18:02:39,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:02:39,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:02:39,768] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:02:41,213] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:02:39,963] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:02:41,737] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:02:40,519] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:02:40,878] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:02:40,961] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:02:40,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:02:42,550] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:02:42,641] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:02:43,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:02:43,524] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:02:43,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:02:43,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:28,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:28,446] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:28,707] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:29,265] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:29,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:31,081] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:29,849] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:30,181] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:30,253] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:32,220] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:33,024] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:33,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:33,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:33,745] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:34,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:03:36,596] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:16,737] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:17,531] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:18,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:18,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:18,971] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:19,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:21,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:20,186] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:20,211] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:21,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:22,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:24,382] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:24,696] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:25,069] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:25,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:04:29,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:05,973] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:07,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:07,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:08,002] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:09,576] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:08,359] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:10,369] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:09,540] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:10,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:11,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:12,406] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:11,447] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:13,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:13,378] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:13,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:05:19,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 18:06:21,003 (deepspeed_trainer:228) INFO: 38epoch:train:7501-7600batch: iter_time=2.572, loss_ctc=66.871, loss_att=46.421, acc=0.741, loss=52.552, grad_norm=5.741, loss_scale=1.000, learning_rate=6.532e-05, step_time=0.377 [cnode7-012:0/16] 2024-12-08 18:06:55,094 (deepspeed_trainer:228) INFO: 38epoch:train:7601-7700batch: iter_time=1.151e-04, loss_ctc=74.053, loss_att=62.772, acc=0.708, loss=66.166, grad_norm=7.539, loss_scale=1.000, learning_rate=6.531e-05, step_time=0.341 [cnode7-012:0/16] 2024-12-08 18:07:29,029 (deepspeed_trainer:228) INFO: 38epoch:train:7701-7800batch: iter_time=1.076e-04, loss_ctc=72.363, loss_att=54.813, acc=0.724, loss=60.073, grad_norm=6.050, loss_scale=1.000, learning_rate=6.531e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 18:08:02,077 (deepspeed_trainer:228) INFO: 38epoch:train:7801-7900batch: iter_time=1.085e-04, loss_ctc=63.250, loss_att=45.739, acc=0.734, loss=51.009, grad_norm=5.144, loss_scale=1.000, learning_rate=6.530e-05, step_time=0.330 [2024-12-08 18:08:35,548] [INFO] [logging.py:129:log_dist] [Rank 0] step=563000, skipped=0, lr=[np.float64(6.529065683977007e-05)], mom=[[0.9, 0.98]] [2024-12-08 18:08:35,549] [INFO] [timer.py:264:stop] epoch=0/micro_step=173000/global_step=173000, RunningAvgSamplesPerSec=49.2226976852052, CurrSamplesPerSec=51.4756481724534, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 18:08:35,551 (deepspeed_trainer:228) INFO: 38epoch:train:7901-8000batch: iter_time=1.111e-04, loss_ctc=68.012, loss_att=50.186, acc=0.726, loss=55.554, grad_norm=5.613, loss_scale=1.000, learning_rate=6.529e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 18:09:09,124 (deepspeed_trainer:228) INFO: 38epoch:train:8001-8100batch: iter_time=1.115e-04, loss_ctc=72.433, loss_att=52.521, acc=0.732, loss=58.502, grad_norm=6.037, loss_scale=1.000, learning_rate=6.529e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 18:09:42,168 (deepspeed_trainer:228) INFO: 38epoch:train:8101-8200batch: iter_time=1.086e-04, loss_ctc=70.813, loss_att=55.632, acc=0.722, loss=60.185, grad_norm=5.725, loss_scale=1.000, learning_rate=6.528e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 18:10:15,530 (deepspeed_trainer:228) INFO: 38epoch:train:8201-8300batch: iter_time=1.110e-04, loss_ctc=80.122, loss_att=57.797, acc=0.724, loss=64.499, grad_norm=5.465, loss_scale=1.000, learning_rate=6.528e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 18:10:48,742 (deepspeed_trainer:228) INFO: 38epoch:train:8301-8400batch: iter_time=1.111e-04, loss_ctc=71.004, loss_att=52.797, acc=0.714, loss=58.266, grad_norm=6.256, loss_scale=1.000, learning_rate=6.527e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 18:11:21,834 (deepspeed_trainer:228) INFO: 38epoch:train:8401-8500batch: iter_time=1.123e-04, loss_ctc=61.028, loss_att=49.042, acc=0.725, loss=52.630, grad_norm=5.944, loss_scale=1.000, learning_rate=6.526e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 18:11:54,638 (deepspeed_trainer:228) INFO: 38epoch:train:8501-8600batch: iter_time=1.095e-04, loss_ctc=68.542, loss_att=51.364, acc=0.717, loss=56.532, grad_norm=5.683, loss_scale=1.000, learning_rate=6.526e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 18:12:27,774 (deepspeed_trainer:228) INFO: 38epoch:train:8601-8700batch: iter_time=1.107e-04, loss_ctc=74.262, loss_att=58.523, acc=0.717, loss=63.214, grad_norm=6.025, loss_scale=1.000, learning_rate=6.525e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 18:13:00,850 (deepspeed_trainer:228) INFO: 38epoch:train:8701-8800batch: iter_time=1.112e-04, loss_ctc=69.279, loss_att=52.738, acc=0.723, loss=57.703, grad_norm=6.532, loss_scale=1.000, learning_rate=6.525e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 18:13:34,316 (deepspeed_trainer:228) INFO: 38epoch:train:8801-8900batch: iter_time=1.090e-04, loss_ctc=84.567, loss_att=63.785, acc=0.706, loss=70.035, grad_norm=6.012, loss_scale=1.000, learning_rate=6.524e-05, step_time=0.334 [2024-12-08 18:14:07,287] [INFO] [logging.py:129:log_dist] [Rank 0] step=564000, skipped=0, lr=[np.float64(6.52327494749182e-05)], mom=[[0.9, 0.98]] [2024-12-08 18:14:07,288] [INFO] [timer.py:264:stop] epoch=0/micro_step=174000/global_step=174000, RunningAvgSamplesPerSec=49.22530751364084, CurrSamplesPerSec=49.99791711662932, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 18:14:07,291 (deepspeed_trainer:228) INFO: 38epoch:train:8901-9000batch: iter_time=1.084e-04, loss_ctc=68.990, loss_att=53.965, acc=0.718, loss=58.488, grad_norm=5.320, loss_scale=1.000, learning_rate=6.524e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 18:14:40,376 (deepspeed_trainer:228) INFO: 38epoch:train:9001-9100batch: iter_time=1.072e-04, loss_ctc=73.561, loss_att=51.389, acc=0.721, loss=58.042, grad_norm=6.453, loss_scale=1.000, learning_rate=6.523e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 18:15:13,763 (deepspeed_trainer:228) INFO: 38epoch:train:9101-9200batch: iter_time=1.119e-04, loss_ctc=80.535, loss_att=63.398, acc=0.713, loss=68.547, grad_norm=6.202, loss_scale=1.000, learning_rate=6.522e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 18:15:46,626 (deepspeed_trainer:228) INFO: 38epoch:train:9201-9300batch: iter_time=1.103e-04, loss_ctc=62.657, loss_att=49.948, acc=0.715, loss=53.799, grad_norm=5.565, loss_scale=1.000, learning_rate=6.522e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 18:16:15,405 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-08 18:16:43,215 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 18:16:59,632 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 18:16:59,632 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 18:16:59,634 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 18:17:23,466] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:17:25,772] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:17:27,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:17:28,053] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:17:26,797] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:17:26,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:17:26,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:17:28,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:17:28,778] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:17:27,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:17:28,966] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:17:27,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:17:27,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:17:29,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:17:29,454] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:17:30,601] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:11,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:14,413] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:15,507] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:17,046] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:15,678] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:16,118] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:16,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:17,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:16,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:18,424] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:18,820] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:19,128] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:19,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:20,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:21,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:22,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:18:58,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:04,453] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:06,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:04,810] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:05,648] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:05,702] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:05,866] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:07,425] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:06,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:08,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:08,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:09,370] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:10,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:10,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:10,667] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:13,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:47,137] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:53,115] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:53,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:55,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:55,295] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:55,561] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:55,693] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:57,422] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:56,399] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:58,106] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:19:58,622] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:20:00,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:20:01,362] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:20:00,277] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:20:01,886] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:20:03,949] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 18:20:38,325 (deepspeed_trainer:228) INFO: 38epoch:train:9301-9400batch: iter_time=2.585, loss_ctc=65.691, loss_att=46.791, acc=0.736, loss=52.446, grad_norm=5.481, loss_scale=1.000, learning_rate=6.521e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 18:21:11,873 (deepspeed_trainer:228) INFO: 38epoch:train:9401-9500batch: iter_time=1.125e-04, loss_ctc=68.729, loss_att=52.938, acc=0.734, loss=57.670, grad_norm=5.811, loss_scale=1.000, learning_rate=6.521e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 18:21:45,253 (deepspeed_trainer:228) INFO: 38epoch:train:9501-9600batch: iter_time=1.054e-04, loss_ctc=77.936, loss_att=60.347, acc=0.722, loss=65.635, grad_norm=7.616, loss_scale=1.000, learning_rate=6.520e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 18:22:19,003 (deepspeed_trainer:228) INFO: 38epoch:train:9601-9700batch: iter_time=1.095e-04, loss_ctc=70.461, loss_att=53.336, acc=0.731, loss=58.463, grad_norm=6.052, loss_scale=1.000, learning_rate=6.520e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 18:22:51,191 (deepspeed_trainer:228) INFO: 38epoch:train:9701-9800batch: iter_time=1.115e-04, loss_ctc=62.932, loss_att=44.683, acc=0.742, loss=50.157, grad_norm=5.461, loss_scale=1.000, learning_rate=6.519e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 18:23:24,716 (deepspeed_trainer:228) INFO: 38epoch:train:9801-9900batch: iter_time=1.090e-04, loss_ctc=66.404, loss_att=50.036, acc=0.735, loss=54.957, grad_norm=5.802, loss_scale=1.000, learning_rate=6.518e-05, step_time=0.335 [2024-12-08 18:23:58,391] [INFO] [logging.py:129:log_dist] [Rank 0] step=565000, skipped=0, lr=[np.float64(6.51749959142662e-05)], mom=[[0.9, 0.98]] [2024-12-08 18:23:58,392] [INFO] [timer.py:264:stop] epoch=0/micro_step=175000/global_step=175000, RunningAvgSamplesPerSec=49.22762727962947, CurrSamplesPerSec=49.02697185073459, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 18:23:58,395 (deepspeed_trainer:228) INFO: 38epoch:train:9901-10000batch: iter_time=1.108e-04, loss_ctc=72.969, loss_att=57.073, acc=0.731, loss=61.847, grad_norm=5.616, loss_scale=1.000, learning_rate=6.518e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 18:24:32,671 (deepspeed_trainer:228) INFO: 38epoch:train:10001-10100batch: iter_time=1.139e-04, loss_ctc=72.605, loss_att=51.126, acc=0.738, loss=57.589, grad_norm=5.460, loss_scale=1.000, learning_rate=6.517e-05, step_time=0.343 [cnode7-012:0/16] 2024-12-08 18:25:06,243 (deepspeed_trainer:228) INFO: 38epoch:train:10101-10200batch: iter_time=1.107e-04, loss_ctc=78.004, loss_att=56.547, acc=0.720, loss=63.002, grad_norm=6.200, loss_scale=1.000, learning_rate=6.517e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 18:25:39,891 (deepspeed_trainer:228) INFO: 38epoch:train:10201-10300batch: iter_time=1.140e-04, loss_ctc=69.974, loss_att=52.766, acc=0.722, loss=57.928, grad_norm=5.211, loss_scale=1.000, learning_rate=6.516e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 18:26:13,115 (deepspeed_trainer:228) INFO: 38epoch:train:10301-10400batch: iter_time=1.099e-04, loss_ctc=59.021, loss_att=46.435, acc=0.732, loss=50.194, grad_norm=5.454, loss_scale=1.000, learning_rate=6.515e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 18:26:46,841 (deepspeed_trainer:228) INFO: 38epoch:train:10401-10500batch: iter_time=1.096e-04, loss_ctc=71.873, loss_att=54.958, acc=0.722, loss=60.061, grad_norm=6.876, loss_scale=1.000, learning_rate=6.515e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 18:27:20,321 (deepspeed_trainer:228) INFO: 38epoch:train:10501-10600batch: iter_time=1.136e-04, loss_ctc=69.672, loss_att=53.527, acc=0.722, loss=58.351, grad_norm=5.564, loss_scale=1.000, learning_rate=6.514e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 18:27:53,757 (deepspeed_trainer:228) INFO: 38epoch:train:10601-10700batch: iter_time=1.128e-04, loss_ctc=80.470, loss_att=62.372, acc=0.719, loss=67.817, grad_norm=6.414, loss_scale=1.000, learning_rate=6.514e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 18:28:26,900 (deepspeed_trainer:228) INFO: 38epoch:train:10701-10800batch: iter_time=1.224e-04, loss_ctc=73.459, loss_att=56.168, acc=0.723, loss=61.379, grad_norm=5.562, loss_scale=1.000, learning_rate=6.513e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 18:29:00,106 (deepspeed_trainer:228) INFO: 38epoch:train:10801-10900batch: iter_time=1.111e-04, loss_ctc=72.686, loss_att=52.845, acc=0.730, loss=58.795, grad_norm=6.347, loss_scale=1.000, learning_rate=6.513e-05, step_time=0.331 [2024-12-08 18:29:33,137] [INFO] [logging.py:129:log_dist] [Rank 0] step=566000, skipped=0, lr=[np.float64(6.51173954781676e-05)], mom=[[0.9, 0.98]] [2024-12-08 18:29:33,138] [INFO] [timer.py:264:stop] epoch=0/micro_step=176000/global_step=176000, RunningAvgSamplesPerSec=49.22844099012399, CurrSamplesPerSec=51.329891415574835, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 18:29:33,141 (deepspeed_trainer:228) INFO: 38epoch:train:10901-11000batch: iter_time=1.135e-04, loss_ctc=70.874, loss_att=53.936, acc=0.720, loss=59.023, grad_norm=6.374, loss_scale=1.000, learning_rate=6.512e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 18:30:06,641 (deepspeed_trainer:228) INFO: 38epoch:train:11001-11100batch: iter_time=1.096e-04, loss_ctc=79.977, loss_att=63.344, acc=0.716, loss=68.334, grad_norm=5.939, loss_scale=1.000, learning_rate=6.511e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 18:30:39,537 (deepspeed_trainer:228) INFO: 38epoch:train:11101-11200batch: iter_time=1.096e-04, loss_ctc=65.926, loss_att=50.798, acc=0.723, loss=55.342, grad_norm=5.808, loss_scale=1.000, learning_rate=6.511e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 18:30:59,964 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-08 18:31:27,187 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 18:31:43,615 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 18:31:43,616 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 18:31:43,618 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 18:32:11,212] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:32:11,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:32:11,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:32:11,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:32:11,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:32:12,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:32:12,737] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:32:13,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:32:12,883] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:32:13,145] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:32:13,426] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:32:13,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:32:13,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:32:13,994] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:32:14,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:32:15,378] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:00,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:00,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:01,044] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:02,132] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:02,003] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:02,482] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:03,074] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:03,186] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:03,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:03,517] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:03,797] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:04,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:04,970] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:05,161] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:05,760] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:05,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:49,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:49,915] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:50,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:50,871] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:51,071] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:51,924] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:52,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:53,002] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:53,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:53,308] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:53,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:54,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:54,861] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:55,283] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:55,513] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:33:57,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:38,221] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:38,518] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:39,789] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:39,734] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:41,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:42,359] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:42,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:42,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:43,123] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:43,082] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:44,298] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:45,064] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:45,321] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:45,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:46,404] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:34:47,491] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 18:35:31,713 (deepspeed_trainer:228) INFO: 38epoch:train:11201-11300batch: iter_time=2.540, loss_ctc=64.213, loss_att=45.822, acc=0.741, loss=51.327, grad_norm=5.117, loss_scale=1.000, learning_rate=6.510e-05, step_time=0.382 [cnode7-012:0/16] 2024-12-08 18:36:05,443 (deepspeed_trainer:228) INFO: 38epoch:train:11301-11400batch: iter_time=1.058e-04, loss_ctc=72.839, loss_att=59.504, acc=0.709, loss=63.524, grad_norm=6.830, loss_scale=1.000, learning_rate=6.510e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 18:36:38,818 (deepspeed_trainer:228) INFO: 38epoch:train:11401-11500batch: iter_time=1.076e-04, loss_ctc=77.875, loss_att=57.418, acc=0.722, loss=63.574, grad_norm=6.582, loss_scale=1.000, learning_rate=6.509e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 18:37:11,855 (deepspeed_trainer:228) INFO: 38epoch:train:11501-11600batch: iter_time=1.113e-04, loss_ctc=61.227, loss_att=47.068, acc=0.738, loss=51.315, grad_norm=5.666, loss_scale=1.000, learning_rate=6.509e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 18:37:44,755 (deepspeed_trainer:228) INFO: 38epoch:train:11601-11700batch: iter_time=1.185e-04, loss_ctc=65.597, loss_att=45.475, acc=0.735, loss=51.505, grad_norm=5.522, loss_scale=1.000, learning_rate=6.508e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 18:38:17,945 (deepspeed_trainer:228) INFO: 38epoch:train:11701-11800batch: iter_time=1.090e-04, loss_ctc=72.160, loss_att=52.624, acc=0.731, loss=58.484, grad_norm=5.556, loss_scale=1.000, learning_rate=6.507e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 18:38:51,162 (deepspeed_trainer:228) INFO: 38epoch:train:11801-11900batch: iter_time=1.098e-04, loss_ctc=67.028, loss_att=53.349, acc=0.731, loss=57.434, grad_norm=5.335, loss_scale=1.000, learning_rate=6.507e-05, step_time=0.332 [2024-12-08 18:39:24,231] [INFO] [logging.py:129:log_dist] [Rank 0] step=567000, skipped=0, lr=[np.float64(6.505994749117311e-05)], mom=[[0.9, 0.98]] [2024-12-08 18:39:24,231] [INFO] [timer.py:264:stop] epoch=0/micro_step=177000/global_step=177000, RunningAvgSamplesPerSec=49.227010656279006, CurrSamplesPerSec=49.25594034092487, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 18:39:24,234 (deepspeed_trainer:228) INFO: 38epoch:train:11901-12000batch: iter_time=1.105e-04, loss_ctc=72.800, loss_att=53.304, acc=0.723, loss=59.180, grad_norm=5.818, loss_scale=1.000, learning_rate=6.506e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 18:39:57,488 (deepspeed_trainer:228) INFO: 38epoch:train:12001-12100batch: iter_time=1.087e-04, loss_ctc=79.488, loss_att=58.409, acc=0.711, loss=64.721, grad_norm=6.335, loss_scale=1.000, learning_rate=6.506e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 18:40:30,625 (deepspeed_trainer:228) INFO: 38epoch:train:12101-12200batch: iter_time=1.098e-04, loss_ctc=66.660, loss_att=49.261, acc=0.729, loss=54.490, grad_norm=5.420, loss_scale=1.000, learning_rate=6.505e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 18:41:03,747 (deepspeed_trainer:228) INFO: 38epoch:train:12201-12300batch: iter_time=1.120e-04, loss_ctc=60.972, loss_att=47.236, acc=0.723, loss=51.368, grad_norm=5.983, loss_scale=1.000, learning_rate=6.505e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 18:41:37,011 (deepspeed_trainer:228) INFO: 38epoch:train:12301-12400batch: iter_time=1.103e-04, loss_ctc=74.914, loss_att=56.769, acc=0.724, loss=62.235, grad_norm=6.176, loss_scale=1.000, learning_rate=6.504e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 18:42:10,363 (deepspeed_trainer:228) INFO: 38epoch:train:12401-12500batch: iter_time=1.099e-04, loss_ctc=72.019, loss_att=60.105, acc=0.714, loss=63.703, grad_norm=6.527, loss_scale=1.000, learning_rate=6.503e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 18:42:43,589 (deepspeed_trainer:228) INFO: 38epoch:train:12501-12600batch: iter_time=1.141e-04, loss_ctc=77.859, loss_att=55.340, acc=0.719, loss=62.126, grad_norm=6.216, loss_scale=1.000, learning_rate=6.503e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 18:43:16,967 (deepspeed_trainer:228) INFO: 38epoch:train:12601-12700batch: iter_time=1.128e-04, loss_ctc=72.799, loss_att=56.573, acc=0.710, loss=61.443, grad_norm=6.284, loss_scale=1.000, learning_rate=6.502e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 18:43:50,426 (deepspeed_trainer:228) INFO: 38epoch:train:12701-12800batch: iter_time=1.097e-04, loss_ctc=74.950, loss_att=54.205, acc=0.723, loss=60.416, grad_norm=6.699, loss_scale=1.000, learning_rate=6.502e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 18:44:23,734 (deepspeed_trainer:228) INFO: 38epoch:train:12801-12900batch: iter_time=1.101e-04, loss_ctc=69.758, loss_att=56.992, acc=0.717, loss=60.842, grad_norm=6.122, loss_scale=1.000, learning_rate=6.501e-05, step_time=0.333 [2024-12-08 18:44:56,927] [INFO] [logging.py:129:log_dist] [Rank 0] step=568000, skipped=0, lr=[np.float64(6.500265128199738e-05)], mom=[[0.9, 0.98]] [2024-12-08 18:44:56,928] [INFO] [timer.py:264:stop] epoch=0/micro_step=178000/global_step=178000, RunningAvgSamplesPerSec=49.229289595021555, CurrSamplesPerSec=49.407668000639404, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 18:44:56,930 (deepspeed_trainer:228) INFO: 38epoch:train:12901-13000batch: iter_time=1.114e-04, loss_ctc=73.061, loss_att=55.150, acc=0.718, loss=60.525, grad_norm=6.125, loss_scale=1.000, learning_rate=6.501e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 18:45:29,799 (deepspeed_trainer:228) INFO: 38epoch:train:13001-13100batch: iter_time=1.107e-04, loss_ctc=66.634, loss_att=48.179, acc=0.726, loss=53.747, grad_norm=5.584, loss_scale=1.000, learning_rate=6.500e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 18:45:41,728 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-08 18:46:09,464 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 18:46:27,006 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 18:46:27,006 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 18:46:27,008 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 18:46:52,580] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:46:52,620] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:46:53,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:46:53,153] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:46:53,804] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:46:54,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:46:54,155] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:46:54,544] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:46:54,471] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:46:54,871] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:46:55,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:46:55,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:46:55,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:46:55,763] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:46:56,334] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:46:56,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:40,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:41,630] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:42,895] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:42,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:43,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:43,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:44,033] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:44,348] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:45,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:45,521] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:46,345] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:47,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:47,363] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:47,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:47,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:47:47,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:28,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:31,900] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:32,724] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:32,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:33,339] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:33,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:34,672] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:35,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:35,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:36,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:36,799] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:37,598] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:37,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:38,916] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:39,262] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:48:40,885] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:16,271] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:20,432] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:21,065] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:21,328] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:22,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:22,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:22,818] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:23,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:24,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:26,638] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:26,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:27,564] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:27,767] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:28,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:28,790] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 18:49:33,023] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 18:50:24,760 (deepspeed_trainer:228) INFO: 38epoch:train:13101-13200batch: iter_time=2.580, loss_ctc=64.362, loss_att=45.305, acc=0.738, loss=51.029, grad_norm=5.482, loss_scale=1.000, learning_rate=6.499e-05, step_time=0.370 [cnode7-012:0/16] 2024-12-08 18:50:57,843 (deepspeed_trainer:228) INFO: 38epoch:train:13201-13300batch: iter_time=1.178e-04, loss_ctc=71.189, loss_att=57.915, acc=0.711, loss=61.891, grad_norm=7.152, loss_scale=1.000, learning_rate=6.499e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 18:51:31,432 (deepspeed_trainer:228) INFO: 38epoch:train:13301-13400batch: iter_time=1.106e-04, loss_ctc=81.407, loss_att=60.766, acc=0.725, loss=66.985, grad_norm=6.510, loss_scale=1.000, learning_rate=6.498e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 18:52:04,618 (deepspeed_trainer:228) INFO: 38epoch:train:13401-13500batch: iter_time=1.105e-04, loss_ctc=61.792, loss_att=45.494, acc=0.734, loss=50.401, grad_norm=6.368, loss_scale=1.000, learning_rate=6.498e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 18:52:37,928 (deepspeed_trainer:228) INFO: 38epoch:train:13501-13600batch: iter_time=1.117e-04, loss_ctc=66.734, loss_att=46.314, acc=0.733, loss=52.438, grad_norm=5.634, loss_scale=1.000, learning_rate=6.497e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 18:53:11,259 (deepspeed_trainer:228) INFO: 38epoch:train:13601-13700batch: iter_time=1.089e-04, loss_ctc=70.022, loss_att=50.468, acc=0.732, loss=56.327, grad_norm=5.761, loss_scale=1.000, learning_rate=6.497e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 18:53:45,512 (deepspeed_trainer:228) INFO: 38epoch:train:13701-13800batch: iter_time=1.143e-04, loss_ctc=67.142, loss_att=53.188, acc=0.735, loss=57.392, grad_norm=4.934, loss_scale=1.000, learning_rate=6.496e-05, step_time=0.342 [cnode7-012:0/16] 2024-12-08 18:54:18,683 (deepspeed_trainer:228) INFO: 38epoch:train:13801-13900batch: iter_time=1.122e-04, loss_ctc=82.887, loss_att=57.856, acc=0.717, loss=65.351, grad_norm=7.198, loss_scale=1.000, learning_rate=6.495e-05, step_time=0.331 [2024-12-08 18:54:52,180] [INFO] [logging.py:129:log_dist] [Rank 0] step=569000, skipped=0, lr=[np.float64(6.494550618348603e-05)], mom=[[0.9, 0.98]] [2024-12-08 18:54:52,180] [INFO] [timer.py:264:stop] epoch=0/micro_step=179000/global_step=179000, RunningAvgSamplesPerSec=49.22730123749166, CurrSamplesPerSec=49.843066537502374, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 18:54:52,183 (deepspeed_trainer:228) INFO: 38epoch:train:13901-14000batch: iter_time=1.120e-04, loss_ctc=75.074, loss_att=58.117, acc=0.710, loss=63.224, grad_norm=6.457, loss_scale=1.000, learning_rate=6.495e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 18:55:25,091 (deepspeed_trainer:228) INFO: 38epoch:train:14001-14100batch: iter_time=1.114e-04, loss_ctc=58.972, loss_att=43.527, acc=0.734, loss=48.182, grad_norm=5.140, loss_scale=1.000, learning_rate=6.494e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 18:55:58,378 (deepspeed_trainer:228) INFO: 38epoch:train:14101-14200batch: iter_time=1.105e-04, loss_ctc=64.261, loss_att=49.481, acc=0.727, loss=53.934, grad_norm=5.833, loss_scale=1.000, learning_rate=6.494e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 18:56:31,843 (deepspeed_trainer:228) INFO: 38epoch:train:14201-14300batch: iter_time=1.139e-04, loss_ctc=76.196, loss_att=57.746, acc=0.718, loss=63.312, grad_norm=6.315, loss_scale=1.000, learning_rate=6.493e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 18:57:05,203 (deepspeed_trainer:228) INFO: 38epoch:train:14301-14400batch: iter_time=1.125e-04, loss_ctc=68.911, loss_att=56.684, acc=0.720, loss=60.338, grad_norm=6.424, loss_scale=1.000, learning_rate=6.493e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 18:57:38,824 (deepspeed_trainer:228) INFO: 38epoch:train:14401-14500batch: iter_time=1.092e-04, loss_ctc=80.941, loss_att=57.695, acc=0.710, loss=64.636, grad_norm=6.215, loss_scale=1.000, learning_rate=6.492e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 18:58:11,963 (deepspeed_trainer:228) INFO: 38epoch:train:14501-14600batch: iter_time=1.110e-04, loss_ctc=67.513, loss_att=54.456, acc=0.720, loss=58.365, grad_norm=5.696, loss_scale=1.000, learning_rate=6.491e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 18:58:44,775 (deepspeed_trainer:228) INFO: 38epoch:train:14601-14700batch: iter_time=1.090e-04, loss_ctc=75.284, loss_att=52.130, acc=0.719, loss=59.107, grad_norm=6.095, loss_scale=1.000, learning_rate=6.491e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 18:59:17,679 (deepspeed_trainer:228) INFO: 38epoch:train:14701-14800batch: iter_time=1.098e-04, loss_ctc=74.099, loss_att=59.877, acc=0.720, loss=64.155, grad_norm=6.368, loss_scale=1.000, learning_rate=6.490e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 18:59:50,321 (deepspeed_trainer:228) INFO: 38epoch:train:14801-14900batch: iter_time=1.098e-04, loss_ctc=68.514, loss_att=50.528, acc=0.722, loss=55.901, grad_norm=6.375, loss_scale=1.000, learning_rate=6.490e-05, step_time=0.326 [2024-12-08 19:00:22,949] [INFO] [logging.py:129:log_dist] [Rank 0] step=570000, skipped=0, lr=[np.float64(6.488851153258302e-05)], mom=[[0.9, 0.98]] [2024-12-08 19:00:22,949] [INFO] [timer.py:264:stop] epoch=0/micro_step=180000/global_step=180000, RunningAvgSamplesPerSec=49.23077751322501, CurrSamplesPerSec=50.069651253267146, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 19:00:22,952 (deepspeed_trainer:228) INFO: 38epoch:train:14901-15000batch: iter_time=1.098e-04, loss_ctc=69.987, loss_att=52.439, acc=0.719, loss=57.712, grad_norm=5.989, loss_scale=1.000, learning_rate=6.489e-05, step_time=0.326 [2024-12-08 19:00:38,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:38,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:38,722] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:39,078] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:39,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:39,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:39,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:39,098] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:39,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:39,447] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:39,146] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:39,520] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:39,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:39,264] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:39,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:39,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:53,701] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:54,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:54,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:54,587] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:54,955] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:55,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:55,537] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:55,270] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:55,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:55,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:56,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:56,049] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:56,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:56,677] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:56,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:00:56,804] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:09,599] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:10,286] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:10,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:10,417] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:11,054] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:11,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:11,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:11,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:11,622] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:11,964] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:11,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:12,490] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:12,282] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:13,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:13,253] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:13,303] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:25,794] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:25,513] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:26,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:26,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:26,380] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:26,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:26,989] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:27,282] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:26,976] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:27,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:27,499] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:27,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:28,186] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:28,146] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:28,591] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:28,936] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:01:41,311] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 38 is about to be saved! [2024-12-08 19:01:41,341] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/mp_rank_00_model_states.pt [2024-12-08 19:01:41,341] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/mp_rank_00_model_states.pt... [2024-12-08 19:01:43,272] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/mp_rank_00_model_states.pt. [2024-12-08 19:01:43,430] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-08 19:01:43,431] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-08 19:01:43,102] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-08 19:01:43,103] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-08 19:01:43,104] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-08 19:01:43,105] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-08 19:01:43,433] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-08 19:01:43,434] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-08 19:01:43,434] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-08 19:01:43,435] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-08 19:01:43,435] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-08 19:01:43,108] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-08 19:01:43,108] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-08 19:01:43,108] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-08 19:01:43,109] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-08 19:01:43,438] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-08 19:01:44,181] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-08 19:01:44,181] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-08 19:01:44,181] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [2024-12-08 19:01:44,216] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-08 19:01:44,216] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-08 19:01:44,217] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [2024-12-08 19:01:44,228] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-08 19:01:44,228] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-08 19:01:44,228] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [2024-12-08 19:01:44,236] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-08 19:01:43,908] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-08 19:01:43,908] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-08 19:01:43,908] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [2024-12-08 19:01:44,242] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-08 19:01:44,242] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [2024-12-08 19:01:43,924] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-08 19:01:43,925] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-08 19:01:43,925] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [2024-12-08 19:01:43,927] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-08 19:01:43,927] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-08 19:01:43,927] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [2024-12-08 19:01:44,265] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-08 19:01:44,266] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-08 19:01:44,266] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [2024-12-08 19:01:44,271] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-08 19:01:44,271] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-08 19:01:44,272] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [2024-12-08 19:01:44,272] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-08 19:01:44,272] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-08 19:01:44,272] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [2024-12-08 19:01:43,946] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-08 19:01:43,946] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-08 19:01:43,946] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [2024-12-08 19:01:43,947] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-08 19:01:44,276] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-08 19:01:43,947] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-08 19:01:44,276] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-08 19:01:43,947] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [2024-12-08 19:01:44,276] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [2024-12-08 19:01:43,949] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-08 19:01:43,950] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-08 19:01:43,950] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [2024-12-08 19:01:43,950] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-08 19:01:43,950] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-08 19:01:43,950] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [2024-12-08 19:01:43,960] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-08 19:01:43,961] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_38/38/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-08 19:01:43,961] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 38 is ready now! [cnode7-012:0/16] 2024-12-08 19:01:44,308 (deepspeed_trainer:158) INFO: 38epoch results: [train] iter_time=0.137, loss_ctc=71.472, loss_att=53.825, acc=0.723, loss=59.122, grad_norm=5.957, loss_scale=1.000, learning_rate=6.532e-05, step_time=0.335, time=1 hour, 58 minutes and 1.08 seconds, total_count=570038, gpu_max_cached_mem_GB=41.424, [valid] loss_ctc=3.312, cer_ctc=0.077, loss_att=5.188, acc=0.911, cer=0.087, wer=0.750, loss=4.625, time=1 minute and 9.06 seconds, total_count=38, gpu_max_cached_mem_GB=41.424 [cnode7-012:0/16] 2024-12-08 19:01:46,173 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-08 19:02:14,044 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 19:02:30,389 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 19:02:30,389 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 19:02:30,391 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 19:02:50,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:02:50,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:02:50,218] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:02:50,885] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:02:50,630] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:02:50,734] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:02:51,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:02:51,008] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:02:51,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:02:51,742] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:02:52,227] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:02:52,192] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:02:52,623] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:02:52,941] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:02:52,952] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:02:54,974] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:38,420] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:39,123] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:40,165] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:40,397] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:40,861] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:40,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:41,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:42,127] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:41,881] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:42,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:42,610] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:43,009] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:43,045] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:43,742] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:44,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:03:49,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:25,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:27,459] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:28,330] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:28,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:28,799] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:30,419] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:30,821] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:31,095] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:31,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:32,540] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:33,104] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:32,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:33,433] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:34,228] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:36,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:04:43,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:14,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:15,143] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:16,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:17,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:17,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:18,457] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:18,824] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:19,044] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:19,797] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:19,718] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:21,298] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:21,838] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:22,823] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:23,245] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:26,877] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:05:33,365] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 19:06:34,501 (deepspeed_trainer:228) INFO: 39epoch:train:1-100batch: iter_time=2.460, loss_ctc=71.638, loss_att=55.405, acc=0.720, loss=60.248, grad_norm=6.151, loss_scale=1.000, learning_rate=6.489e-05, step_time=0.424 [cnode7-012:0/16] 2024-12-08 19:07:08,072 (deepspeed_trainer:228) INFO: 39epoch:train:101-200batch: iter_time=1.159e-04, loss_ctc=67.361, loss_att=52.295, acc=0.725, loss=56.820, grad_norm=5.613, loss_scale=1.000, learning_rate=6.488e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 19:07:41,515 (deepspeed_trainer:228) INFO: 39epoch:train:201-300batch: iter_time=1.065e-04, loss_ctc=67.150, loss_att=48.659, acc=0.723, loss=54.208, grad_norm=6.352, loss_scale=1.000, learning_rate=6.487e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 19:08:15,133 (deepspeed_trainer:228) INFO: 39epoch:train:301-400batch: iter_time=1.104e-04, loss_ctc=71.588, loss_att=52.498, acc=0.728, loss=58.231, grad_norm=6.395, loss_scale=1.000, learning_rate=6.487e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 19:08:48,735 (deepspeed_trainer:228) INFO: 39epoch:train:401-500batch: iter_time=1.107e-04, loss_ctc=71.064, loss_att=51.559, acc=0.725, loss=57.407, grad_norm=6.146, loss_scale=1.000, learning_rate=6.486e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 19:09:22,726 (deepspeed_trainer:228) INFO: 39epoch:train:501-600batch: iter_time=1.115e-04, loss_ctc=74.959, loss_att=61.162, acc=0.717, loss=65.292, grad_norm=6.549, loss_scale=1.000, learning_rate=6.486e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 19:09:56,672 (deepspeed_trainer:228) INFO: 39epoch:train:601-700batch: iter_time=1.106e-04, loss_ctc=77.567, loss_att=55.685, acc=0.723, loss=62.244, grad_norm=6.358, loss_scale=1.000, learning_rate=6.485e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 19:10:29,707 (deepspeed_trainer:228) INFO: 39epoch:train:701-800batch: iter_time=1.119e-04, loss_ctc=64.771, loss_att=44.133, acc=0.732, loss=50.307, grad_norm=5.975, loss_scale=1.000, learning_rate=6.485e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 19:11:03,573 (deepspeed_trainer:228) INFO: 39epoch:train:801-900batch: iter_time=1.076e-04, loss_ctc=77.410, loss_att=60.478, acc=0.709, loss=65.577, grad_norm=6.230, loss_scale=1.000, learning_rate=6.484e-05, step_time=0.338 [2024-12-08 19:11:37,352] [INFO] [logging.py:129:log_dist] [Rank 0] step=571000, skipped=0, lr=[np.float64(6.483166667029841e-05)], mom=[[0.9, 0.98]] [2024-12-08 19:11:37,352] [INFO] [timer.py:264:stop] epoch=0/micro_step=181000/global_step=181000, RunningAvgSamplesPerSec=49.22243357081049, CurrSamplesPerSec=47.840377300578524, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 19:11:37,356 (deepspeed_trainer:228) INFO: 39epoch:train:901-1000batch: iter_time=1.089e-04, loss_ctc=81.639, loss_att=59.705, acc=0.732, loss=66.297, grad_norm=6.552, loss_scale=1.000, learning_rate=6.483e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 19:12:10,493 (deepspeed_trainer:228) INFO: 39epoch:train:1001-1100batch: iter_time=1.058e-04, loss_ctc=69.744, loss_att=52.839, acc=0.725, loss=57.937, grad_norm=5.553, loss_scale=1.000, learning_rate=6.483e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 19:12:43,558 (deepspeed_trainer:228) INFO: 39epoch:train:1101-1200batch: iter_time=1.101e-04, loss_ctc=70.669, loss_att=49.859, acc=0.737, loss=56.098, grad_norm=6.140, loss_scale=1.000, learning_rate=6.482e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 19:13:17,137 (deepspeed_trainer:228) INFO: 39epoch:train:1201-1300batch: iter_time=1.187e-04, loss_ctc=70.514, loss_att=56.261, acc=0.715, loss=60.549, grad_norm=5.445, loss_scale=1.000, learning_rate=6.482e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 19:13:50,870 (deepspeed_trainer:228) INFO: 39epoch:train:1301-1400batch: iter_time=1.089e-04, loss_ctc=76.056, loss_att=61.106, acc=0.718, loss=65.579, grad_norm=6.197, loss_scale=1.000, learning_rate=6.481e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 19:14:24,406 (deepspeed_trainer:228) INFO: 39epoch:train:1401-1500batch: iter_time=1.146e-04, loss_ctc=72.831, loss_att=52.698, acc=0.743, loss=58.760, grad_norm=5.689, loss_scale=1.000, learning_rate=6.481e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 19:14:57,922 (deepspeed_trainer:228) INFO: 39epoch:train:1501-1600batch: iter_time=1.124e-04, loss_ctc=73.380, loss_att=53.414, acc=0.726, loss=59.429, grad_norm=6.354, loss_scale=1.000, learning_rate=6.480e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 19:15:31,423 (deepspeed_trainer:228) INFO: 39epoch:train:1601-1700batch: iter_time=1.120e-04, loss_ctc=61.758, loss_att=41.785, acc=0.730, loss=47.781, grad_norm=5.394, loss_scale=1.000, learning_rate=6.479e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 19:16:05,477 (deepspeed_trainer:228) INFO: 39epoch:train:1701-1800batch: iter_time=1.159e-04, loss_ctc=80.147, loss_att=65.219, acc=0.714, loss=69.674, grad_norm=6.036, loss_scale=1.000, learning_rate=6.479e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 19:16:35,648 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-08 19:17:01,303 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 19:17:18,183 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 19:17:18,183 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 19:17:18,185 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 19:17:50,330] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:17:51,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:17:51,240] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:17:50,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:17:50,608] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:17:51,369] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:17:51,418] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:17:50,680] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:17:50,712] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:17:51,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:17:50,785] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:17:51,538] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:17:51,548] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:17:50,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:17:51,576] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:17:50,857] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:42,735] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:42,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:43,196] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:43,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:43,771] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:43,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:43,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:44,586] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:43,902] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:44,655] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:44,703] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:44,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:44,919] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:44,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:44,396] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:18:44,493] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:31,747] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:31,828] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:32,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:32,785] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:32,129] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:33,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:32,819] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:33,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:33,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:33,340] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:33,720] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:33,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:34,652] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:34,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:36,427] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:19:37,387] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:20,887] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:21,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:21,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:22,729] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:22,719] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:23,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:23,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:23,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:24,451] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:23,959] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:24,996] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:25,044] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:24,385] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:26,068] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:29,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:20:29,882] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 19:21:04,497 (deepspeed_trainer:228) INFO: 39epoch:train:1801-1900batch: iter_time=2.595, loss_ctc=70.037, loss_att=55.499, acc=0.723, loss=59.865, grad_norm=5.918, loss_scale=1.000, learning_rate=6.478e-05, step_time=0.401 [2024-12-08 19:21:38,218] [INFO] [logging.py:129:log_dist] [Rank 0] step=572000, skipped=0, lr=[np.float64(6.477497094167622e-05)], mom=[[0.9, 0.98]] [2024-12-08 19:21:38,218] [INFO] [timer.py:264:stop] epoch=0/micro_step=182000/global_step=182000, RunningAvgSamplesPerSec=49.21734783405605, CurrSamplesPerSec=45.80487180760001, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 19:21:38,221 (deepspeed_trainer:228) INFO: 39epoch:train:1901-2000batch: iter_time=1.115e-04, loss_ctc=75.062, loss_att=57.065, acc=0.721, loss=62.478, grad_norm=6.187, loss_scale=1.000, learning_rate=6.478e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 19:22:11,271 (deepspeed_trainer:228) INFO: 39epoch:train:2001-2100batch: iter_time=1.142e-04, loss_ctc=60.017, loss_att=43.018, acc=0.731, loss=48.101, grad_norm=6.083, loss_scale=1.000, learning_rate=6.477e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 19:22:44,608 (deepspeed_trainer:228) INFO: 39epoch:train:2101-2200batch: iter_time=1.062e-04, loss_ctc=72.832, loss_att=55.222, acc=0.721, loss=60.500, grad_norm=6.354, loss_scale=1.000, learning_rate=6.477e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 19:23:17,829 (deepspeed_trainer:228) INFO: 39epoch:train:2201-2300batch: iter_time=1.077e-04, loss_ctc=67.087, loss_att=46.865, acc=0.739, loss=52.951, grad_norm=5.709, loss_scale=1.000, learning_rate=6.476e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 19:23:51,823 (deepspeed_trainer:228) INFO: 39epoch:train:2301-2400batch: iter_time=1.093e-04, loss_ctc=73.579, loss_att=58.596, acc=0.718, loss=63.062, grad_norm=6.204, loss_scale=1.000, learning_rate=6.476e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 19:24:26,068 (deepspeed_trainer:228) INFO: 39epoch:train:2401-2500batch: iter_time=1.064e-04, loss_ctc=67.581, loss_att=52.479, acc=0.725, loss=56.999, grad_norm=6.104, loss_scale=1.000, learning_rate=6.475e-05, step_time=0.342 [cnode7-012:0/16] 2024-12-08 19:24:59,585 (deepspeed_trainer:228) INFO: 39epoch:train:2501-2600batch: iter_time=1.066e-04, loss_ctc=79.099, loss_att=55.197, acc=0.726, loss=62.360, grad_norm=5.637, loss_scale=1.000, learning_rate=6.474e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 19:25:32,827 (deepspeed_trainer:228) INFO: 39epoch:train:2601-2700batch: iter_time=1.067e-04, loss_ctc=67.429, loss_att=45.413, acc=0.734, loss=52.015, grad_norm=5.724, loss_scale=1.000, learning_rate=6.474e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 19:26:06,810 (deepspeed_trainer:228) INFO: 39epoch:train:2701-2800batch: iter_time=1.075e-04, loss_ctc=76.810, loss_att=61.514, acc=0.719, loss=66.114, grad_norm=6.580, loss_scale=1.000, learning_rate=6.473e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 19:26:40,105 (deepspeed_trainer:228) INFO: 39epoch:train:2801-2900batch: iter_time=1.066e-04, loss_ctc=75.980, loss_att=55.263, acc=0.728, loss=61.460, grad_norm=5.944, loss_scale=1.000, learning_rate=6.473e-05, step_time=0.332 [2024-12-08 19:27:13,231] [INFO] [logging.py:129:log_dist] [Rank 0] step=573000, skipped=0, lr=[np.float64(6.47184236957629e-05)], mom=[[0.9, 0.98]] [2024-12-08 19:27:13,232] [INFO] [timer.py:264:stop] epoch=0/micro_step=183000/global_step=183000, RunningAvgSamplesPerSec=49.217402772479005, CurrSamplesPerSec=50.25585484720521, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 19:27:13,235 (deepspeed_trainer:228) INFO: 39epoch:train:2901-3000batch: iter_time=1.083e-04, loss_ctc=69.709, loss_att=52.637, acc=0.734, loss=57.725, grad_norm=5.409, loss_scale=1.000, learning_rate=6.472e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 19:27:46,535 (deepspeed_trainer:228) INFO: 39epoch:train:3001-3100batch: iter_time=1.079e-04, loss_ctc=69.852, loss_att=49.792, acc=0.736, loss=55.785, grad_norm=5.206, loss_scale=1.000, learning_rate=6.472e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 19:28:19,838 (deepspeed_trainer:228) INFO: 39epoch:train:3101-3200batch: iter_time=1.083e-04, loss_ctc=70.658, loss_att=55.929, acc=0.714, loss=60.343, grad_norm=5.553, loss_scale=1.000, learning_rate=6.471e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 19:28:53,134 (deepspeed_trainer:228) INFO: 39epoch:train:3201-3300batch: iter_time=1.083e-04, loss_ctc=74.786, loss_att=57.396, acc=0.728, loss=62.589, grad_norm=6.423, loss_scale=1.000, learning_rate=6.470e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 19:29:26,345 (deepspeed_trainer:228) INFO: 39epoch:train:3301-3400batch: iter_time=1.087e-04, loss_ctc=72.810, loss_att=56.336, acc=0.743, loss=61.265, grad_norm=5.811, loss_scale=1.000, learning_rate=6.470e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 19:29:59,046 (deepspeed_trainer:228) INFO: 39epoch:train:3401-3500batch: iter_time=1.077e-04, loss_ctc=67.368, loss_att=48.237, acc=0.722, loss=53.962, grad_norm=5.477, loss_scale=1.000, learning_rate=6.469e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 19:30:31,898 (deepspeed_trainer:228) INFO: 39epoch:train:3501-3600batch: iter_time=1.094e-04, loss_ctc=66.274, loss_att=44.754, acc=0.731, loss=51.198, grad_norm=5.830, loss_scale=1.000, learning_rate=6.469e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 19:31:05,554 (deepspeed_trainer:228) INFO: 39epoch:train:3601-3700batch: iter_time=1.080e-04, loss_ctc=79.467, loss_att=62.847, acc=0.724, loss=67.876, grad_norm=5.879, loss_scale=1.000, learning_rate=6.468e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 19:31:25,839 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-08 19:31:52,601 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 19:32:11,371 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 19:32:11,371 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 19:32:11,373 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 19:32:35,793] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:32:36,423] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:32:36,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:32:36,943] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:32:37,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:32:37,332] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:32:37,946] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:32:38,743] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:32:38,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:32:39,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:32:39,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:32:39,211] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:32:40,195] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:32:39,544] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:32:39,604] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:32:39,935] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:25,075] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:25,458] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:25,798] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:26,103] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:27,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:26,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:27,392] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:28,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:28,706] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:28,174] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:29,015] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:29,260] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:29,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:29,667] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:32,652] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:33:33,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:14,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:14,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:14,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:15,086] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:15,557] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:15,701] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:16,801] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:17,165] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:16,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:17,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:18,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:18,997] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:19,027] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:19,428] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:22,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:34:23,934] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:03,111] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:04,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:04,324] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:03,887] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:05,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:05,417] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:05,342] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:06,888] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:07,304] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:06,656] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:07,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:08,443] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:08,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:10,989] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:12,031] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:35:16,068] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 19:35:59,648 (deepspeed_trainer:228) INFO: 39epoch:train:3701-3800batch: iter_time=2.565, loss_ctc=70.439, loss_att=57.234, acc=0.722, loss=61.219, grad_norm=5.996, loss_scale=1.000, learning_rate=6.468e-05, step_time=0.376 [cnode7-012:0/16] 2024-12-08 19:36:33,456 (deepspeed_trainer:228) INFO: 39epoch:train:3801-3900batch: iter_time=1.137e-04, loss_ctc=68.162, loss_att=54.346, acc=0.719, loss=58.481, grad_norm=6.209, loss_scale=1.000, learning_rate=6.467e-05, step_time=0.338 [2024-12-08 19:37:06,862] [INFO] [logging.py:129:log_dist] [Rank 0] step=574000, skipped=0, lr=[np.float64(6.466202428557576e-05)], mom=[[0.9, 0.98]] [2024-12-08 19:37:06,865] [INFO] [timer.py:264:stop] epoch=0/micro_step=184000/global_step=184000, RunningAvgSamplesPerSec=49.21577982821655, CurrSamplesPerSec=49.297659383623724, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 19:37:06,869 (deepspeed_trainer:228) INFO: 39epoch:train:3901-4000batch: iter_time=1.078e-04, loss_ctc=60.224, loss_att=40.936, acc=0.736, loss=46.746, grad_norm=5.663, loss_scale=1.000, learning_rate=6.466e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 19:37:40,229 (deepspeed_trainer:228) INFO: 39epoch:train:4001-4100batch: iter_time=1.099e-04, loss_ctc=75.157, loss_att=56.941, acc=0.725, loss=62.432, grad_norm=7.020, loss_scale=1.000, learning_rate=6.466e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 19:38:13,321 (deepspeed_trainer:228) INFO: 39epoch:train:4101-4200batch: iter_time=1.072e-04, loss_ctc=68.162, loss_att=46.406, acc=0.740, loss=52.947, grad_norm=5.564, loss_scale=1.000, learning_rate=6.465e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 19:38:46,583 (deepspeed_trainer:228) INFO: 39epoch:train:4201-4300batch: iter_time=1.072e-04, loss_ctc=72.207, loss_att=58.417, acc=0.719, loss=62.544, grad_norm=6.296, loss_scale=1.000, learning_rate=6.465e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 19:39:20,516 (deepspeed_trainer:228) INFO: 39epoch:train:4301-4400batch: iter_time=1.206e-04, loss_ctc=68.661, loss_att=53.303, acc=0.726, loss=57.923, grad_norm=6.126, loss_scale=1.000, learning_rate=6.464e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 19:39:53,461 (deepspeed_trainer:228) INFO: 39epoch:train:4401-4500batch: iter_time=1.108e-04, loss_ctc=72.729, loss_att=51.694, acc=0.727, loss=57.995, grad_norm=6.129, loss_scale=1.000, learning_rate=6.464e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 19:40:27,037 (deepspeed_trainer:228) INFO: 39epoch:train:4501-4600batch: iter_time=1.103e-04, loss_ctc=73.293, loss_att=47.658, acc=0.736, loss=55.344, grad_norm=6.263, loss_scale=1.000, learning_rate=6.463e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 19:41:00,474 (deepspeed_trainer:228) INFO: 39epoch:train:4601-4700batch: iter_time=1.086e-04, loss_ctc=80.356, loss_att=66.566, acc=0.719, loss=70.697, grad_norm=6.316, loss_scale=1.000, learning_rate=6.463e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 19:41:33,424 (deepspeed_trainer:228) INFO: 39epoch:train:4701-4800batch: iter_time=1.088e-04, loss_ctc=67.741, loss_att=48.148, acc=0.735, loss=53.992, grad_norm=6.121, loss_scale=1.000, learning_rate=6.462e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 19:42:06,790 (deepspeed_trainer:228) INFO: 39epoch:train:4801-4900batch: iter_time=1.101e-04, loss_ctc=79.737, loss_att=57.067, acc=0.730, loss=63.874, grad_norm=6.232, loss_scale=1.000, learning_rate=6.461e-05, step_time=0.333 [2024-12-08 19:42:39,881] [INFO] [logging.py:129:log_dist] [Rank 0] step=575000, skipped=0, lr=[np.float64(6.460577206807205e-05)], mom=[[0.9, 0.98]] [2024-12-08 19:42:39,882] [INFO] [timer.py:264:stop] epoch=0/micro_step=185000/global_step=185000, RunningAvgSamplesPerSec=49.217632506080015, CurrSamplesPerSec=47.31506888792029, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 19:42:39,885 (deepspeed_trainer:228) INFO: 39epoch:train:4901-5000batch: iter_time=1.097e-04, loss_ctc=62.240, loss_att=49.722, acc=0.726, loss=53.459, grad_norm=4.918, loss_scale=1.000, learning_rate=6.461e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 19:43:14,409 (deepspeed_trainer:228) INFO: 39epoch:train:5001-5100batch: iter_time=1.102e-04, loss_ctc=71.581, loss_att=54.951, acc=0.727, loss=59.940, grad_norm=5.208, loss_scale=1.000, learning_rate=6.460e-05, step_time=0.341 [cnode7-012:0/16] 2024-12-08 19:43:47,769 (deepspeed_trainer:228) INFO: 39epoch:train:5101-5200batch: iter_time=1.085e-04, loss_ctc=78.208, loss_att=57.731, acc=0.730, loss=63.837, grad_norm=6.635, loss_scale=1.000, learning_rate=6.460e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 19:44:20,980 (deepspeed_trainer:228) INFO: 39epoch:train:5201-5300batch: iter_time=1.141e-04, loss_ctc=71.263, loss_att=54.671, acc=0.737, loss=59.674, grad_norm=5.221, loss_scale=1.000, learning_rate=6.459e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 19:44:54,245 (deepspeed_trainer:228) INFO: 39epoch:train:5301-5400batch: iter_time=1.091e-04, loss_ctc=68.855, loss_att=47.623, acc=0.727, loss=53.978, grad_norm=5.566, loss_scale=1.000, learning_rate=6.459e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 19:45:27,257 (deepspeed_trainer:228) INFO: 39epoch:train:5401-5500batch: iter_time=1.110e-04, loss_ctc=63.744, loss_att=45.812, acc=0.732, loss=51.193, grad_norm=5.729, loss_scale=1.000, learning_rate=6.458e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 19:46:00,612 (deepspeed_trainer:228) INFO: 39epoch:train:5501-5600batch: iter_time=1.120e-04, loss_ctc=74.328, loss_att=61.662, acc=0.717, loss=65.477, grad_norm=5.936, loss_scale=1.000, learning_rate=6.457e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 19:46:13,327 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-08 19:46:40,399 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 19:46:57,243 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 19:46:57,243 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 19:46:57,246 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 19:47:22,832] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:47:23,038] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:47:23,707] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:47:23,952] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:47:24,369] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:47:25,598] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:47:26,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:47:26,237] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:47:26,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:47:26,633] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:47:26,635] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:47:27,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:47:27,201] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:47:27,366] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:47:27,693] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:47:28,212] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:11,388] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:12,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:12,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:14,810] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:15,053] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:15,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:15,678] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:16,634] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:17,207] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:17,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:18,197] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:18,681] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:19,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:19,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:19,336] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:48:19,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:02,179] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:03,488] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:03,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:04,087] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:04,364] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:04,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:05,423] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:06,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:07,234] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:07,250] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:07,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:08,532] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:08,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:09,934] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:10,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:10,168] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:52,684] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:53,446] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:54,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:54,999] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:55,683] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:55,468] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:55,958] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:56,583] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:56,656] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:57,472] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:58,129] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:58,457] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:49:59,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:50:00,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:50:01,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 19:50:01,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 19:50:54,435 (deepspeed_trainer:228) INFO: 39epoch:train:5601-5700batch: iter_time=2.588, loss_ctc=74.146, loss_att=59.249, acc=0.712, loss=63.744, grad_norm=6.214, loss_scale=1.000, learning_rate=6.457e-05, step_time=0.350 [cnode7-012:0/16] 2024-12-08 19:51:28,242 (deepspeed_trainer:228) INFO: 39epoch:train:5701-5800batch: iter_time=1.076e-04, loss_ctc=64.563, loss_att=49.647, acc=0.732, loss=54.109, grad_norm=5.508, loss_scale=1.000, learning_rate=6.456e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 19:52:02,170 (deepspeed_trainer:228) INFO: 39epoch:train:5801-5900batch: iter_time=1.134e-04, loss_ctc=67.375, loss_att=49.335, acc=0.718, loss=54.735, grad_norm=5.833, loss_scale=1.000, learning_rate=6.456e-05, step_time=0.339 [2024-12-08 19:52:35,830] [INFO] [logging.py:129:log_dist] [Rank 0] step=576000, skipped=0, lr=[np.float64(6.454966640411807e-05)], mom=[[0.9, 0.98]] [2024-12-08 19:52:35,831] [INFO] [timer.py:264:stop] epoch=0/micro_step=186000/global_step=186000, RunningAvgSamplesPerSec=49.216282486518416, CurrSamplesPerSec=50.84477880943157, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 19:52:35,834 (deepspeed_trainer:228) INFO: 39epoch:train:5901-6000batch: iter_time=1.083e-04, loss_ctc=70.121, loss_att=51.039, acc=0.720, loss=56.760, grad_norm=6.471, loss_scale=1.000, learning_rate=6.455e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 19:53:09,177 (deepspeed_trainer:228) INFO: 39epoch:train:6001-6100batch: iter_time=1.057e-04, loss_ctc=64.792, loss_att=47.113, acc=0.732, loss=52.412, grad_norm=5.469, loss_scale=1.000, learning_rate=6.455e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 19:53:42,633 (deepspeed_trainer:228) INFO: 39epoch:train:6101-6200batch: iter_time=1.061e-04, loss_ctc=77.724, loss_att=60.492, acc=0.718, loss=65.652, grad_norm=6.671, loss_scale=1.000, learning_rate=6.454e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 19:54:15,516 (deepspeed_trainer:228) INFO: 39epoch:train:6201-6300batch: iter_time=1.068e-04, loss_ctc=71.909, loss_att=54.002, acc=0.717, loss=59.375, grad_norm=6.058, loss_scale=1.000, learning_rate=6.454e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 19:54:48,807 (deepspeed_trainer:228) INFO: 39epoch:train:6301-6400batch: iter_time=1.050e-04, loss_ctc=66.429, loss_att=46.899, acc=0.727, loss=52.759, grad_norm=5.942, loss_scale=1.000, learning_rate=6.453e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 19:55:22,281 (deepspeed_trainer:228) INFO: 39epoch:train:6401-6500batch: iter_time=1.056e-04, loss_ctc=80.074, loss_att=59.430, acc=0.714, loss=65.644, grad_norm=6.537, loss_scale=1.000, learning_rate=6.452e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 19:55:55,731 (deepspeed_trainer:228) INFO: 39epoch:train:6501-6600batch: iter_time=1.087e-04, loss_ctc=74.381, loss_att=56.241, acc=0.726, loss=61.711, grad_norm=6.257, loss_scale=1.000, learning_rate=6.452e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 19:56:29,157 (deepspeed_trainer:228) INFO: 39epoch:train:6601-6700batch: iter_time=1.095e-04, loss_ctc=69.977, loss_att=50.841, acc=0.723, loss=56.559, grad_norm=5.508, loss_scale=1.000, learning_rate=6.451e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 19:57:02,475 (deepspeed_trainer:228) INFO: 39epoch:train:6701-6800batch: iter_time=1.083e-04, loss_ctc=73.845, loss_att=54.191, acc=0.729, loss=60.072, grad_norm=6.110, loss_scale=1.000, learning_rate=6.451e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 19:57:35,684 (deepspeed_trainer:228) INFO: 39epoch:train:6801-6900batch: iter_time=1.152e-04, loss_ctc=65.164, loss_att=49.123, acc=0.728, loss=53.958, grad_norm=5.947, loss_scale=1.000, learning_rate=6.450e-05, step_time=0.332 [2024-12-08 19:58:08,990] [INFO] [logging.py:129:log_dist] [Rank 0] step=577000, skipped=0, lr=[np.float64(6.449370665845867e-05)], mom=[[0.9, 0.98]] [2024-12-08 19:58:08,991] [INFO] [timer.py:264:stop] epoch=0/micro_step=187000/global_step=187000, RunningAvgSamplesPerSec=49.21786062691455, CurrSamplesPerSec=49.95905855066013, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 19:58:08,994 (deepspeed_trainer:228) INFO: 39epoch:train:6901-7000batch: iter_time=1.085e-04, loss_ctc=74.716, loss_att=58.463, acc=0.717, loss=63.348, grad_norm=6.725, loss_scale=1.000, learning_rate=6.450e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 19:58:42,093 (deepspeed_trainer:228) INFO: 39epoch:train:7001-7100batch: iter_time=1.072e-04, loss_ctc=72.507, loss_att=52.048, acc=0.739, loss=58.163, grad_norm=5.571, loss_scale=1.000, learning_rate=6.449e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 19:59:15,031 (deepspeed_trainer:228) INFO: 39epoch:train:7101-7200batch: iter_time=1.052e-04, loss_ctc=68.278, loss_att=51.735, acc=0.723, loss=56.667, grad_norm=5.582, loss_scale=1.000, learning_rate=6.449e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 19:59:47,957 (deepspeed_trainer:228) INFO: 39epoch:train:7201-7300batch: iter_time=1.076e-04, loss_ctc=64.304, loss_att=45.344, acc=0.728, loss=51.016, grad_norm=5.026, loss_scale=1.000, learning_rate=6.448e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 20:00:21,120 (deepspeed_trainer:228) INFO: 39epoch:train:7301-7400batch: iter_time=1.070e-04, loss_ctc=71.811, loss_att=54.234, acc=0.723, loss=59.503, grad_norm=5.747, loss_scale=1.000, learning_rate=6.447e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 20:00:54,052 (deepspeed_trainer:228) INFO: 39epoch:train:7401-7500batch: iter_time=1.029e-04, loss_ctc=72.866, loss_att=58.720, acc=0.717, loss=62.967, grad_norm=5.481, loss_scale=1.000, learning_rate=6.447e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 20:00:58,139 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-08 20:01:25,675 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 20:01:41,959 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 20:01:41,959 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 20:01:41,962 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 20:02:08,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:08,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:08,748] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:09,227] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:09,447] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:09,664] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:10,118] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:10,067] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:10,529] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:10,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:10,638] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:11,073] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:11,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:11,634] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:11,830] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:11,947] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:57,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:58,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:58,537] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:58,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:58,873] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:58,956] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:59,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:59,635] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:59,620] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:59,801] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:02:59,888] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:00,189] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:00,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:00,851] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:01,517] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:04,074] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:47,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:48,145] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:48,676] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:48,882] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:49,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:49,750] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:49,751] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:50,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:50,586] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:50,980] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:51,300] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:51,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:53,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:54,815] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:55,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:03:56,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:38,121] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:38,161] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:38,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:39,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:40,834] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:41,374] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:41,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:41,463] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:41,915] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:42,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:42,324] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:42,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:44,061] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:45,824] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:47,360] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:04:49,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 20:05:50,733 (deepspeed_trainer:228) INFO: 39epoch:train:7501-7600batch: iter_time=2.588, loss_ctc=69.746, loss_att=53.202, acc=0.721, loss=58.129, grad_norm=5.769, loss_scale=1.000, learning_rate=6.446e-05, step_time=0.379 [cnode7-012:0/16] 2024-12-08 20:06:24,657 (deepspeed_trainer:228) INFO: 39epoch:train:7601-7700batch: iter_time=1.165e-04, loss_ctc=66.396, loss_att=50.145, acc=0.725, loss=55.031, grad_norm=5.338, loss_scale=1.000, learning_rate=6.446e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 20:06:58,458 (deepspeed_trainer:228) INFO: 39epoch:train:7701-7800batch: iter_time=1.125e-04, loss_ctc=66.721, loss_att=46.788, acc=0.727, loss=52.771, grad_norm=5.910, loss_scale=1.000, learning_rate=6.445e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 20:07:31,920 (deepspeed_trainer:228) INFO: 39epoch:train:7801-7900batch: iter_time=1.115e-04, loss_ctc=70.019, loss_att=50.497, acc=0.726, loss=56.369, grad_norm=6.617, loss_scale=1.000, learning_rate=6.445e-05, step_time=0.335 [2024-12-08 20:08:05,589] [INFO] [logging.py:129:log_dist] [Rank 0] step=578000, skipped=0, lr=[np.float64(6.443789219968709e-05)], mom=[[0.9, 0.98]] [2024-12-08 20:08:05,589] [INFO] [timer.py:264:stop] epoch=0/micro_step=188000/global_step=188000, RunningAvgSamplesPerSec=49.21547055012152, CurrSamplesPerSec=48.41388234092368, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 20:08:05,592 (deepspeed_trainer:228) INFO: 39epoch:train:7901-8000batch: iter_time=1.097e-04, loss_ctc=69.355, loss_att=50.718, acc=0.723, loss=56.313, grad_norm=6.104, loss_scale=1.000, learning_rate=6.444e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 20:08:39,163 (deepspeed_trainer:228) INFO: 39epoch:train:8001-8100batch: iter_time=1.121e-04, loss_ctc=72.835, loss_att=58.068, acc=0.720, loss=62.540, grad_norm=6.034, loss_scale=1.000, learning_rate=6.444e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 20:09:12,376 (deepspeed_trainer:228) INFO: 39epoch:train:8101-8200batch: iter_time=1.111e-04, loss_ctc=76.276, loss_att=54.542, acc=0.722, loss=61.060, grad_norm=6.508, loss_scale=1.000, learning_rate=6.443e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 20:09:45,306 (deepspeed_trainer:228) INFO: 39epoch:train:8201-8300batch: iter_time=1.123e-04, loss_ctc=62.778, loss_att=42.928, acc=0.737, loss=48.890, grad_norm=5.845, loss_scale=1.000, learning_rate=6.442e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 20:10:18,784 (deepspeed_trainer:228) INFO: 39epoch:train:8301-8400batch: iter_time=1.123e-04, loss_ctc=75.790, loss_att=60.144, acc=0.705, loss=64.823, grad_norm=6.958, loss_scale=1.000, learning_rate=6.442e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 20:10:52,408 (deepspeed_trainer:228) INFO: 39epoch:train:8401-8500batch: iter_time=1.109e-04, loss_ctc=79.039, loss_att=56.314, acc=0.733, loss=63.126, grad_norm=6.131, loss_scale=1.000, learning_rate=6.441e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 20:11:25,815 (deepspeed_trainer:228) INFO: 39epoch:train:8501-8600batch: iter_time=1.096e-04, loss_ctc=68.519, loss_att=50.020, acc=0.725, loss=55.553, grad_norm=5.416, loss_scale=1.000, learning_rate=6.441e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 20:11:58,854 (deepspeed_trainer:228) INFO: 39epoch:train:8601-8700batch: iter_time=1.121e-04, loss_ctc=69.573, loss_att=49.843, acc=0.734, loss=55.757, grad_norm=5.578, loss_scale=1.000, learning_rate=6.440e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 20:12:32,170 (deepspeed_trainer:228) INFO: 39epoch:train:8701-8800batch: iter_time=1.090e-04, loss_ctc=69.161, loss_att=52.945, acc=0.719, loss=57.841, grad_norm=5.598, loss_scale=1.000, learning_rate=6.440e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 20:13:05,740 (deepspeed_trainer:228) INFO: 39epoch:train:8801-8900batch: iter_time=1.148e-04, loss_ctc=74.508, loss_att=57.937, acc=0.720, loss=62.886, grad_norm=6.124, loss_scale=1.000, learning_rate=6.439e-05, step_time=0.336 [2024-12-08 20:13:39,169] [INFO] [logging.py:129:log_dist] [Rank 0] step=579000, skipped=0, lr=[np.float64(6.438222240021497e-05)], mom=[[0.9, 0.98]] [2024-12-08 20:13:39,169] [INFO] [timer.py:264:stop] epoch=0/micro_step=189000/global_step=189000, RunningAvgSamplesPerSec=49.21649116366906, CurrSamplesPerSec=49.286218515138714, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 20:13:39,172 (deepspeed_trainer:228) INFO: 39epoch:train:8901-9000batch: iter_time=1.100e-04, loss_ctc=71.347, loss_att=52.268, acc=0.745, loss=58.020, grad_norm=5.639, loss_scale=1.000, learning_rate=6.438e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 20:14:12,221 (deepspeed_trainer:228) INFO: 39epoch:train:9001-9100batch: iter_time=1.121e-04, loss_ctc=70.486, loss_att=52.766, acc=0.721, loss=58.068, grad_norm=6.059, loss_scale=1.000, learning_rate=6.438e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 20:14:45,057 (deepspeed_trainer:228) INFO: 39epoch:train:9101-9200batch: iter_time=1.139e-04, loss_ctc=61.002, loss_att=41.126, acc=0.732, loss=47.072, grad_norm=5.343, loss_scale=1.000, learning_rate=6.437e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 20:15:18,153 (deepspeed_trainer:228) INFO: 39epoch:train:9201-9300batch: iter_time=1.137e-04, loss_ctc=79.606, loss_att=62.423, acc=0.714, loss=67.604, grad_norm=6.791, loss_scale=1.000, learning_rate=6.437e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 20:15:46,988 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-08 20:16:14,197 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 20:16:30,363 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 20:16:30,363 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 20:16:30,366 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 20:16:56,724] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:16:56,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:16:57,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:16:58,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:16:58,526] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:16:58,920] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:16:58,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:16:59,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:16:59,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:16:59,313] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:16:59,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:00,187] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:00,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:00,630] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:00,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:01,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:46,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:46,727] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:47,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:47,210] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:47,844] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:49,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:50,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:50,132] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:50,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:50,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:50,616] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:50,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:50,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:50,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:50,959] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:17:52,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:35,185] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:35,537] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:36,623] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:37,822] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:39,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:39,097] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:39,334] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:39,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:39,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:40,031] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:40,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:40,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:40,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:40,686] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:40,967] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:18:41,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:24,625] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:24,810] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:26,450] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:27,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:28,331] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:29,148] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:29,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:29,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:29,777] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:30,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:30,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:31,190] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:31,386] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:31,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:32,048] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:19:32,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 20:20:08,950 (deepspeed_trainer:228) INFO: 39epoch:train:9301-9400batch: iter_time=2.564, loss_ctc=68.826, loss_att=54.507, acc=0.722, loss=58.819, grad_norm=5.194, loss_scale=1.000, learning_rate=6.436e-05, step_time=0.344 [cnode7-012:0/16] 2024-12-08 20:20:42,047 (deepspeed_trainer:228) INFO: 39epoch:train:9401-9500batch: iter_time=1.078e-04, loss_ctc=74.198, loss_att=56.168, acc=0.716, loss=61.579, grad_norm=6.356, loss_scale=1.000, learning_rate=6.436e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 20:21:15,445 (deepspeed_trainer:228) INFO: 39epoch:train:9501-9600batch: iter_time=1.076e-04, loss_ctc=59.413, loss_att=41.075, acc=0.734, loss=46.560, grad_norm=5.158, loss_scale=1.000, learning_rate=6.435e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 20:21:48,804 (deepspeed_trainer:228) INFO: 39epoch:train:9601-9700batch: iter_time=1.125e-04, loss_ctc=71.700, loss_att=53.387, acc=0.721, loss=58.872, grad_norm=6.856, loss_scale=1.000, learning_rate=6.435e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 20:22:21,837 (deepspeed_trainer:228) INFO: 39epoch:train:9701-9800batch: iter_time=1.098e-04, loss_ctc=66.299, loss_att=45.605, acc=0.738, loss=51.822, grad_norm=5.896, loss_scale=1.000, learning_rate=6.434e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 20:22:55,035 (deepspeed_trainer:228) INFO: 39epoch:train:9801-9900batch: iter_time=1.106e-04, loss_ctc=72.842, loss_att=57.544, acc=0.718, loss=62.098, grad_norm=6.488, loss_scale=1.000, learning_rate=6.433e-05, step_time=0.332 [2024-12-08 20:23:27,905] [INFO] [logging.py:129:log_dist] [Rank 0] step=580000, skipped=0, lr=[np.float64(6.432669663624277e-05)], mom=[[0.9, 0.98]] [2024-12-08 20:23:27,906] [INFO] [timer.py:264:stop] epoch=0/micro_step=190000/global_step=190000, RunningAvgSamplesPerSec=49.218525608551, CurrSamplesPerSec=50.71486750573085, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 20:23:27,908 (deepspeed_trainer:228) INFO: 39epoch:train:9901-10000batch: iter_time=1.084e-04, loss_ctc=66.315, loss_att=51.467, acc=0.723, loss=55.950, grad_norm=6.100, loss_scale=1.000, learning_rate=6.433e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 20:24:02,221 (deepspeed_trainer:228) INFO: 39epoch:train:10001-10100batch: iter_time=1.102e-04, loss_ctc=77.371, loss_att=54.295, acc=0.724, loss=61.232, grad_norm=5.532, loss_scale=1.000, learning_rate=6.432e-05, step_time=0.343 [cnode7-012:0/16] 2024-12-08 20:24:35,634 (deepspeed_trainer:228) INFO: 39epoch:train:10101-10200batch: iter_time=1.087e-04, loss_ctc=66.694, loss_att=44.951, acc=0.734, loss=51.468, grad_norm=5.636, loss_scale=1.000, learning_rate=6.432e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 20:25:09,157 (deepspeed_trainer:228) INFO: 39epoch:train:10201-10300batch: iter_time=1.133e-04, loss_ctc=74.875, loss_att=59.161, acc=0.717, loss=63.891, grad_norm=6.301, loss_scale=1.000, learning_rate=6.431e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 20:25:42,243 (deepspeed_trainer:228) INFO: 39epoch:train:10301-10400batch: iter_time=1.089e-04, loss_ctc=75.893, loss_att=53.925, acc=0.727, loss=60.503, grad_norm=6.468, loss_scale=1.000, learning_rate=6.431e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 20:26:15,404 (deepspeed_trainer:228) INFO: 39epoch:train:10401-10500batch: iter_time=1.126e-04, loss_ctc=69.344, loss_att=52.624, acc=0.728, loss=57.623, grad_norm=5.385, loss_scale=1.000, learning_rate=6.430e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 20:26:48,648 (deepspeed_trainer:228) INFO: 39epoch:train:10501-10600batch: iter_time=1.122e-04, loss_ctc=68.998, loss_att=48.984, acc=0.733, loss=55.000, grad_norm=5.268, loss_scale=1.000, learning_rate=6.430e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 20:27:21,975 (deepspeed_trainer:228) INFO: 39epoch:train:10601-10700batch: iter_time=1.075e-04, loss_ctc=70.032, loss_att=52.927, acc=0.716, loss=58.104, grad_norm=5.396, loss_scale=1.000, learning_rate=6.429e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 20:27:55,220 (deepspeed_trainer:228) INFO: 39epoch:train:10701-10800batch: iter_time=1.082e-04, loss_ctc=73.124, loss_att=55.711, acc=0.728, loss=60.930, grad_norm=6.044, loss_scale=1.000, learning_rate=6.429e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 20:28:28,711 (deepspeed_trainer:228) INFO: 39epoch:train:10801-10900batch: iter_time=1.157e-04, loss_ctc=72.999, loss_att=55.344, acc=0.743, loss=60.626, grad_norm=5.377, loss_scale=1.000, learning_rate=6.428e-05, step_time=0.334 [2024-12-08 20:29:01,721] [INFO] [logging.py:129:log_dist] [Rank 0] step=581000, skipped=0, lr=[np.float64(6.427131428773043e-05)], mom=[[0.9, 0.98]] [2024-12-08 20:29:01,721] [INFO] [timer.py:264:stop] epoch=0/micro_step=191000/global_step=191000, RunningAvgSamplesPerSec=49.21976995433945, CurrSamplesPerSec=51.042703706305495, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 20:29:01,724 (deepspeed_trainer:228) INFO: 39epoch:train:10901-11000batch: iter_time=1.083e-04, loss_ctc=66.748, loss_att=47.508, acc=0.723, loss=53.274, grad_norm=5.430, loss_scale=1.000, learning_rate=6.427e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 20:29:36,111 (deepspeed_trainer:228) INFO: 39epoch:train:11001-11100batch: iter_time=1.080e-04, loss_ctc=64.688, loss_att=44.118, acc=0.732, loss=50.275, grad_norm=5.062, loss_scale=1.000, learning_rate=6.427e-05, step_time=0.343 [cnode7-012:0/16] 2024-12-08 20:30:09,614 (deepspeed_trainer:228) INFO: 39epoch:train:11101-11200batch: iter_time=1.076e-04, loss_ctc=77.311, loss_att=61.351, acc=0.721, loss=66.123, grad_norm=6.466, loss_scale=1.000, learning_rate=6.426e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 20:30:29,795 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-08 20:30:57,112 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 20:31:13,495 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 20:31:13,496 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 20:31:13,498 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 20:31:39,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:31:39,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:31:40,503] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:31:41,601] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:31:41,986] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:31:42,042] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:31:41,958] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:31:42,321] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:31:42,556] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:31:42,740] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:31:43,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:31:43,650] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:31:43,574] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:31:43,713] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:31:43,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:31:44,023] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:29,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:29,925] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:30,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:31,478] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:31,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:32,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:32,426] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:32,643] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:33,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:33,289] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:33,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:33,635] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:33,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:33,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:36,242] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:32:36,921] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:17,062] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:17,845] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:20,390] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:20,843] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:21,055] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:21,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:22,065] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:22,488] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:22,718] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:22,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:23,143] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:23,570] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:23,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:23,983] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:26,640] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:33:26,918] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:04,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:05,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:10,410] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:10,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:11,039] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:11,572] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:11,773] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:12,796] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:14,033] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:14,089] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:13,929] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:14,648] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:14,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:14,807] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:15,945] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:34:17,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 20:35:00,893 (deepspeed_trainer:228) INFO: 39epoch:train:11201-11300batch: iter_time=2.540, loss_ctc=69.844, loss_att=58.330, acc=0.719, loss=61.762, grad_norm=5.781, loss_scale=1.000, learning_rate=6.426e-05, step_time=0.372 [cnode7-012:0/16] 2024-12-08 20:35:34,087 (deepspeed_trainer:228) INFO: 39epoch:train:11301-11400batch: iter_time=1.081e-04, loss_ctc=67.728, loss_att=55.167, acc=0.720, loss=58.968, grad_norm=5.794, loss_scale=1.000, learning_rate=6.425e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 20:36:07,329 (deepspeed_trainer:228) INFO: 39epoch:train:11401-11500batch: iter_time=1.092e-04, loss_ctc=59.846, loss_att=41.338, acc=0.735, loss=46.892, grad_norm=5.489, loss_scale=1.000, learning_rate=6.425e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 20:36:40,765 (deepspeed_trainer:228) INFO: 39epoch:train:11501-11600batch: iter_time=1.090e-04, loss_ctc=74.991, loss_att=57.741, acc=0.726, loss=62.921, grad_norm=6.407, loss_scale=1.000, learning_rate=6.424e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 20:37:14,005 (deepspeed_trainer:228) INFO: 39epoch:train:11601-11700batch: iter_time=1.089e-04, loss_ctc=67.360, loss_att=46.379, acc=0.743, loss=52.680, grad_norm=5.558, loss_scale=1.000, learning_rate=6.424e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 20:37:47,712 (deepspeed_trainer:228) INFO: 39epoch:train:11701-11800batch: iter_time=1.076e-04, loss_ctc=72.961, loss_att=58.860, acc=0.722, loss=63.089, grad_norm=6.883, loss_scale=1.000, learning_rate=6.423e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 20:38:21,089 (deepspeed_trainer:228) INFO: 39epoch:train:11801-11900batch: iter_time=1.099e-04, loss_ctc=67.620, loss_att=54.246, acc=0.723, loss=58.241, grad_norm=5.828, loss_scale=1.000, learning_rate=6.422e-05, step_time=0.333 [2024-12-08 20:38:54,539] [INFO] [logging.py:129:log_dist] [Rank 0] step=582000, skipped=0, lr=[np.float64(6.421607473836816e-05)], mom=[[0.9, 0.98]] [2024-12-08 20:38:54,540] [INFO] [timer.py:264:stop] epoch=0/micro_step=192000/global_step=192000, RunningAvgSamplesPerSec=49.21675021697333, CurrSamplesPerSec=50.76139929946188, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 20:38:54,543 (deepspeed_trainer:228) INFO: 39epoch:train:11901-12000batch: iter_time=1.124e-04, loss_ctc=72.321, loss_att=52.061, acc=0.728, loss=58.149, grad_norm=5.908, loss_scale=1.000, learning_rate=6.422e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 20:39:28,028 (deepspeed_trainer:228) INFO: 39epoch:train:12001-12100batch: iter_time=1.075e-04, loss_ctc=72.123, loss_att=48.073, acc=0.736, loss=55.323, grad_norm=6.298, loss_scale=1.000, learning_rate=6.421e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 20:40:01,999 (deepspeed_trainer:228) INFO: 39epoch:train:12101-12200batch: iter_time=1.080e-04, loss_ctc=79.569, loss_att=66.584, acc=0.720, loss=70.483, grad_norm=6.210, loss_scale=1.000, learning_rate=6.421e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 20:40:35,573 (deepspeed_trainer:228) INFO: 39epoch:train:12201-12300batch: iter_time=1.092e-04, loss_ctc=67.115, loss_att=48.287, acc=0.738, loss=53.924, grad_norm=5.359, loss_scale=1.000, learning_rate=6.420e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 20:41:09,350 (deepspeed_trainer:228) INFO: 39epoch:train:12301-12400batch: iter_time=1.077e-04, loss_ctc=79.227, loss_att=56.677, acc=0.735, loss=63.448, grad_norm=6.030, loss_scale=1.000, learning_rate=6.420e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 20:41:42,684 (deepspeed_trainer:228) INFO: 39epoch:train:12401-12500batch: iter_time=1.091e-04, loss_ctc=61.464, loss_att=49.439, acc=0.728, loss=53.064, grad_norm=5.586, loss_scale=1.000, learning_rate=6.419e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 20:42:16,224 (deepspeed_trainer:228) INFO: 39epoch:train:12501-12600batch: iter_time=1.135e-04, loss_ctc=70.571, loss_att=54.918, acc=0.728, loss=59.634, grad_norm=6.043, loss_scale=1.000, learning_rate=6.419e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 20:42:49,966 (deepspeed_trainer:228) INFO: 39epoch:train:12601-12700batch: iter_time=1.091e-04, loss_ctc=76.836, loss_att=57.120, acc=0.733, loss=63.018, grad_norm=6.262, loss_scale=1.000, learning_rate=6.418e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 20:43:23,509 (deepspeed_trainer:228) INFO: 39epoch:train:12701-12800batch: iter_time=1.083e-04, loss_ctc=70.195, loss_att=54.259, acc=0.740, loss=59.045, grad_norm=5.196, loss_scale=1.000, learning_rate=6.417e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 20:43:56,676 (deepspeed_trainer:228) INFO: 39epoch:train:12801-12900batch: iter_time=1.073e-04, loss_ctc=67.939, loss_att=47.139, acc=0.730, loss=53.361, grad_norm=5.921, loss_scale=1.000, learning_rate=6.417e-05, step_time=0.331 [2024-12-08 20:44:29,672] [INFO] [logging.py:129:log_dist] [Rank 0] step=583000, skipped=0, lr=[np.float64(6.416097737554784e-05)], mom=[[0.9, 0.98]] [2024-12-08 20:44:29,672] [INFO] [timer.py:264:stop] epoch=0/micro_step=193000/global_step=193000, RunningAvgSamplesPerSec=49.216569263459334, CurrSamplesPerSec=52.630225977186576, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 20:44:29,675 (deepspeed_trainer:228) INFO: 39epoch:train:12901-13000batch: iter_time=1.086e-04, loss_ctc=63.358, loss_att=46.008, acc=0.734, loss=51.211, grad_norm=6.399, loss_scale=1.000, learning_rate=6.416e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 20:45:03,347 (deepspeed_trainer:228) INFO: 39epoch:train:13001-13100batch: iter_time=1.088e-04, loss_ctc=73.438, loss_att=61.518, acc=0.717, loss=65.085, grad_norm=5.371, loss_scale=1.000, learning_rate=6.416e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 20:45:15,412 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-08 20:45:42,807 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 20:46:00,598 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 20:46:00,598 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 20:46:00,601 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 20:46:23,763] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:46:24,548] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:46:24,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:46:25,500] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:46:25,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:46:26,506] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:46:25,988] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:46:27,058] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:46:27,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:46:26,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:46:27,570] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:46:28,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:46:27,543] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:46:28,983] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:46:29,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:46:30,074] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:13,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:13,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:14,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:14,722] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:15,368] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:16,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:16,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:16,829] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:16,264] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:16,727] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:17,699] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:17,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:18,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:18,543] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:18,991] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:47:20,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:01,884] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:03,013] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:03,447] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:03,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:04,955] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:05,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:06,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:05,713] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:06,720] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:06,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:07,273] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:08,398] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:09,216] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:09,678] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:09,714] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:11,803] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:51,449] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:52,193] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:52,476] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:52,838] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:55,027] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:54,869] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:55,667] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:54,977] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:55,407] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:56,211] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:57,446] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:58,624] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:58,661] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:58,965] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:48:58,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 20:49:00,669] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 20:49:54,840 (deepspeed_trainer:228) INFO: 39epoch:train:13101-13200batch: iter_time=2.558, loss_ctc=73.023, loss_att=57.535, acc=0.717, loss=62.191, grad_norm=6.095, loss_scale=1.000, learning_rate=6.415e-05, step_time=0.356 [cnode7-012:0/16] 2024-12-08 20:50:28,810 (deepspeed_trainer:228) INFO: 39epoch:train:13201-13300batch: iter_time=1.172e-04, loss_ctc=64.476, loss_att=48.589, acc=0.735, loss=53.335, grad_norm=5.409, loss_scale=1.000, learning_rate=6.415e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 20:51:02,516 (deepspeed_trainer:228) INFO: 39epoch:train:13301-13400batch: iter_time=1.080e-04, loss_ctc=66.657, loss_att=48.000, acc=0.724, loss=53.593, grad_norm=5.969, loss_scale=1.000, learning_rate=6.414e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 20:51:35,789 (deepspeed_trainer:228) INFO: 39epoch:train:13401-13500batch: iter_time=1.066e-04, loss_ctc=69.374, loss_att=50.486, acc=0.721, loss=56.138, grad_norm=6.373, loss_scale=1.000, learning_rate=6.414e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 20:52:09,133 (deepspeed_trainer:228) INFO: 39epoch:train:13501-13600batch: iter_time=1.125e-04, loss_ctc=64.636, loss_att=46.616, acc=0.733, loss=52.042, grad_norm=5.289, loss_scale=1.000, learning_rate=6.413e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 20:52:42,711 (deepspeed_trainer:228) INFO: 39epoch:train:13601-13700batch: iter_time=1.085e-04, loss_ctc=76.412, loss_att=59.898, acc=0.721, loss=64.845, grad_norm=6.514, loss_scale=1.000, learning_rate=6.413e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 20:53:15,733 (deepspeed_trainer:228) INFO: 39epoch:train:13701-13800batch: iter_time=1.096e-04, loss_ctc=71.096, loss_att=53.024, acc=0.722, loss=58.439, grad_norm=5.887, loss_scale=1.000, learning_rate=6.412e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 20:53:48,459 (deepspeed_trainer:228) INFO: 39epoch:train:13801-13900batch: iter_time=1.099e-04, loss_ctc=65.852, loss_att=46.392, acc=0.730, loss=52.259, grad_norm=5.997, loss_scale=1.000, learning_rate=6.411e-05, step_time=0.327 [2024-12-08 20:54:21,629] [INFO] [logging.py:129:log_dist] [Rank 0] step=584000, skipped=0, lr=[np.float64(6.410602159033427e-05)], mom=[[0.9, 0.98]] [2024-12-08 20:54:21,630] [INFO] [timer.py:264:stop] epoch=0/micro_step=194000/global_step=194000, RunningAvgSamplesPerSec=49.215612082711885, CurrSamplesPerSec=51.471581617226065, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 20:54:21,633 (deepspeed_trainer:228) INFO: 39epoch:train:13901-14000batch: iter_time=1.096e-04, loss_ctc=79.236, loss_att=58.625, acc=0.719, loss=64.797, grad_norm=6.161, loss_scale=1.000, learning_rate=6.411e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 20:54:54,700 (deepspeed_trainer:228) INFO: 39epoch:train:14001-14100batch: iter_time=1.078e-04, loss_ctc=73.980, loss_att=56.294, acc=0.727, loss=61.598, grad_norm=6.609, loss_scale=1.000, learning_rate=6.410e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 20:55:28,247 (deepspeed_trainer:228) INFO: 39epoch:train:14101-14200batch: iter_time=1.127e-04, loss_ctc=69.836, loss_att=50.542, acc=0.726, loss=56.329, grad_norm=5.511, loss_scale=1.000, learning_rate=6.410e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 20:56:01,548 (deepspeed_trainer:228) INFO: 39epoch:train:14201-14300batch: iter_time=1.129e-04, loss_ctc=73.297, loss_att=52.912, acc=0.734, loss=59.022, grad_norm=6.196, loss_scale=1.000, learning_rate=6.409e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 20:56:34,707 (deepspeed_trainer:228) INFO: 39epoch:train:14301-14400batch: iter_time=1.102e-04, loss_ctc=64.283, loss_att=49.037, acc=0.728, loss=53.639, grad_norm=5.493, loss_scale=1.000, learning_rate=6.409e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 20:57:07,917 (deepspeed_trainer:228) INFO: 39epoch:train:14401-14500batch: iter_time=1.083e-04, loss_ctc=74.224, loss_att=57.930, acc=0.718, loss=62.817, grad_norm=6.265, loss_scale=1.000, learning_rate=6.408e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 20:57:41,100 (deepspeed_trainer:228) INFO: 39epoch:train:14501-14600batch: iter_time=1.110e-04, loss_ctc=71.818, loss_att=51.288, acc=0.743, loss=57.430, grad_norm=5.888, loss_scale=1.000, learning_rate=6.408e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 20:58:14,239 (deepspeed_trainer:228) INFO: 39epoch:train:14601-14700batch: iter_time=1.109e-04, loss_ctc=68.433, loss_att=50.855, acc=0.726, loss=56.171, grad_norm=6.227, loss_scale=1.000, learning_rate=6.407e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 20:58:47,341 (deepspeed_trainer:228) INFO: 39epoch:train:14701-14800batch: iter_time=1.098e-04, loss_ctc=64.027, loss_att=44.819, acc=0.730, loss=50.583, grad_norm=5.166, loss_scale=1.000, learning_rate=6.406e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 20:59:21,030 (deepspeed_trainer:228) INFO: 39epoch:train:14801-14900batch: iter_time=1.070e-04, loss_ctc=71.672, loss_att=54.024, acc=0.723, loss=59.306, grad_norm=6.367, loss_scale=1.000, learning_rate=6.406e-05, step_time=0.337 [2024-12-08 20:59:54,119] [INFO] [logging.py:129:log_dist] [Rank 0] step=585000, skipped=0, lr=[np.float64(6.405120677743707e-05)], mom=[[0.9, 0.98]] [2024-12-08 20:59:54,119] [INFO] [timer.py:264:stop] epoch=0/micro_step=195000/global_step=195000, RunningAvgSamplesPerSec=49.21753970886558, CurrSamplesPerSec=54.10734536227899, MemAllocated=2.04GB, MaxMemAllocated=35.26GB [cnode7-012:0/16] 2024-12-08 20:59:54,122 (deepspeed_trainer:228) INFO: 39epoch:train:14901-15000batch: iter_time=1.095e-04, loss_ctc=72.289, loss_att=57.782, acc=0.720, loss=62.137, grad_norm=5.805, loss_scale=1.000, learning_rate=6.405e-05, step_time=0.331 [2024-12-08 21:00:09,170] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:09,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:09,029] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:09,145] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:10,040] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:10,150] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:09,374] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:10,460] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:09,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:10,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:09,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:10,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:10,620] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:09,804] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:09,847] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:09,903] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:25,487] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:24,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:25,754] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:25,055] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:26,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:25,980] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:26,191] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:26,265] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:26,316] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:27,455] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:27,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:26,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:27,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:26,932] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:27,838] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:27,850] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:41,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:40,571] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:41,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:40,956] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:41,861] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:41,365] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:41,581] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:42,746] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:42,060] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:42,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:43,063] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:43,509] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:42,718] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:42,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:43,954] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:43,992] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:56,000] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:56,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:56,881] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:56,203] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:56,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:57,883] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:57,076] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:58,145] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:57,358] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:58,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:57,813] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:58,334] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:59,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:59,321] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:58,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:00:58,814] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:01:12,120] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 39 is about to be saved! [2024-12-08 21:01:12,151] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/mp_rank_00_model_states.pt [2024-12-08 21:01:12,151] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/mp_rank_00_model_states.pt... [2024-12-08 21:01:14,124] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/mp_rank_00_model_states.pt. [2024-12-08 21:01:13,425] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-08 21:01:13,425] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-08 21:01:13,425] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-08 21:01:13,427] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-08 21:01:14,284] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-08 21:01:13,428] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-08 21:01:13,428] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-08 21:01:13,428] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-08 21:01:13,429] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-08 21:01:14,286] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-08 21:01:14,287] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-08 21:01:14,287] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-08 21:01:14,287] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-08 21:01:14,288] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-08 21:01:14,288] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-08 21:01:14,288] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-08 21:01:15,025] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-08 21:01:15,025] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-08 21:01:15,026] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [2024-12-08 21:01:15,042] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-08 21:01:15,042] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-08 21:01:15,042] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [2024-12-08 21:01:15,051] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-08 21:01:15,052] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-08 21:01:15,052] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [2024-12-08 21:01:15,052] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-08 21:01:15,052] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-08 21:01:15,052] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [2024-12-08 21:01:15,053] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-08 21:01:15,058] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-08 21:01:15,058] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-08 21:01:15,058] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [2024-12-08 21:01:15,059] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-08 21:01:15,059] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-08 21:01:15,059] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [2024-12-08 21:01:15,061] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-08 21:01:15,061] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-08 21:01:15,061] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [2024-12-08 21:01:14,270] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-08 21:01:14,270] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-08 21:01:14,270] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-08 21:01:14,270] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-08 21:01:14,270] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-08 21:01:14,270] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-08 21:01:14,270] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-08 21:01:14,270] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [2024-12-08 21:01:14,270] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [2024-12-08 21:01:14,270] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-08 21:01:14,270] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [2024-12-08 21:01:14,270] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [2024-12-08 21:01:15,154] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-08 21:01:15,154] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [2024-12-08 21:01:15,505] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-08 21:01:15,505] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-08 21:01:15,505] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [2024-12-08 21:01:15,873] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-08 21:01:15,873] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-08 21:01:15,873] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [2024-12-08 21:01:16,237] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-08 21:01:16,237] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-08 21:01:16,237] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [2024-12-08 21:01:16,240] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-08 21:01:16,241] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_39/39/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-08 21:01:16,241] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 39 is ready now! [cnode7-012:0/16] 2024-12-08 21:01:17,100 (deepspeed_trainer:158) INFO: 39epoch results: [train] iter_time=0.136, loss_ctc=70.858, loss_att=53.000, acc=0.726, loss=58.357, grad_norm=5.938, loss_scale=1.000, learning_rate=6.447e-05, step_time=0.336, time=1 hour, 58 minutes and 16.82 seconds, total_count=585039, gpu_max_cached_mem_GB=41.424, [valid] loss_ctc=3.375, cer_ctc=0.078, loss_att=5.156, acc=0.912, cer=0.074, wer=0.750, loss=4.625, time=1 minute and 8.88 seconds, total_count=39, gpu_max_cached_mem_GB=41.424 [cnode7-012:0/16] 2024-12-08 21:01:18,495 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-08 21:01:45,420 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 21:02:01,408 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 21:02:01,408 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 21:02:01,410 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 21:02:21,946] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:02:23,422] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:02:22,963] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:02:23,330] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:02:23,561] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:02:25,185] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:02:24,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:02:24,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:02:25,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:02:25,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:02:24,908] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:02:25,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:02:26,098] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:02:26,138] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:02:26,966] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:02:27,016] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:10,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:10,654] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:11,682] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:11,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:11,327] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:13,926] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:14,149] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:14,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:14,005] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:14,037] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:14,043] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:15,185] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:15,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:14,489] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:15,515] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:17,467] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:58,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:59,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:59,539] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:03:59,922] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:01,398] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:02,498] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:02,749] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:03,824] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:02,971] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:03,149] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:04,781] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:05,381] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:05,665] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:07,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:09,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:10,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:47,543] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:47,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:48,558] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:49,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:50,998] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:51,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:51,627] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:51,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:52,680] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:52,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:53,129] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:54,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:54,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:04:57,405] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:05:01,551] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:05:01,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 21:06:02,902 (deepspeed_trainer:228) INFO: 40epoch:train:1-100batch: iter_time=2.430, loss_ctc=75.767, loss_att=62.917, acc=0.695, loss=66.774, grad_norm=6.957, loss_scale=1.000, learning_rate=6.405e-05, step_time=0.415 [cnode7-012:0/16] 2024-12-08 21:06:36,580 (deepspeed_trainer:228) INFO: 40epoch:train:101-200batch: iter_time=1.097e-04, loss_ctc=69.254, loss_att=56.955, acc=0.720, loss=60.606, grad_norm=5.618, loss_scale=1.000, learning_rate=6.404e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 21:07:10,078 (deepspeed_trainer:228) INFO: 40epoch:train:201-300batch: iter_time=1.105e-04, loss_ctc=73.066, loss_att=57.036, acc=0.719, loss=61.857, grad_norm=5.694, loss_scale=1.000, learning_rate=6.404e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 21:07:43,306 (deepspeed_trainer:228) INFO: 40epoch:train:301-400batch: iter_time=1.155e-04, loss_ctc=75.086, loss_att=55.381, acc=0.709, loss=61.309, grad_norm=6.204, loss_scale=1.000, learning_rate=6.403e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 21:08:16,500 (deepspeed_trainer:228) INFO: 40epoch:train:401-500batch: iter_time=1.115e-04, loss_ctc=68.288, loss_att=52.231, acc=0.730, loss=57.044, grad_norm=5.875, loss_scale=1.000, learning_rate=6.403e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 21:08:49,461 (deepspeed_trainer:228) INFO: 40epoch:train:501-600batch: iter_time=1.133e-04, loss_ctc=66.493, loss_att=49.910, acc=0.714, loss=54.903, grad_norm=5.907, loss_scale=1.000, learning_rate=6.402e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 21:09:22,729 (deepspeed_trainer:228) INFO: 40epoch:train:601-700batch: iter_time=1.140e-04, loss_ctc=74.671, loss_att=53.841, acc=0.710, loss=60.082, grad_norm=6.790, loss_scale=1.000, learning_rate=6.402e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 21:09:55,928 (deepspeed_trainer:228) INFO: 40epoch:train:701-800batch: iter_time=1.100e-04, loss_ctc=71.544, loss_att=56.288, acc=0.708, loss=60.876, grad_norm=6.261, loss_scale=1.000, learning_rate=6.401e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 21:10:28,850 (deepspeed_trainer:228) INFO: 40epoch:train:801-900batch: iter_time=1.117e-04, loss_ctc=63.230, loss_att=47.323, acc=0.728, loss=52.080, grad_norm=5.449, loss_scale=1.000, learning_rate=6.400e-05, step_time=0.329 [2024-12-08 21:11:02,206] [INFO] [logging.py:129:log_dist] [Rank 0] step=586000, skipped=0, lr=[np.float64(6.39965323351826e-05)], mom=[[0.9, 0.98]] [2024-12-08 21:11:02,207] [INFO] [timer.py:264:stop] epoch=0/micro_step=196000/global_step=196000, RunningAvgSamplesPerSec=49.21349782289236, CurrSamplesPerSec=48.93584379608172, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 21:11:02,210 (deepspeed_trainer:228) INFO: 40epoch:train:901-1000batch: iter_time=1.155e-04, loss_ctc=80.118, loss_att=50.562, acc=0.734, loss=59.406, grad_norm=6.101, loss_scale=1.000, learning_rate=6.400e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 21:11:35,322 (deepspeed_trainer:228) INFO: 40epoch:train:1001-1100batch: iter_time=1.107e-04, loss_ctc=74.003, loss_att=52.107, acc=0.726, loss=58.670, grad_norm=5.606, loss_scale=1.000, learning_rate=6.399e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 21:12:08,693 (deepspeed_trainer:228) INFO: 40epoch:train:1101-1200batch: iter_time=1.120e-04, loss_ctc=66.199, loss_att=49.553, acc=0.738, loss=54.525, grad_norm=5.049, loss_scale=1.000, learning_rate=6.399e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 21:12:41,998 (deepspeed_trainer:228) INFO: 40epoch:train:1201-1300batch: iter_time=1.163e-04, loss_ctc=74.935, loss_att=58.582, acc=0.718, loss=63.510, grad_norm=6.051, loss_scale=1.000, learning_rate=6.398e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 21:13:15,104 (deepspeed_trainer:228) INFO: 40epoch:train:1301-1400batch: iter_time=1.135e-04, loss_ctc=65.123, loss_att=54.237, acc=0.716, loss=57.474, grad_norm=5.356, loss_scale=1.000, learning_rate=6.398e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 21:13:48,181 (deepspeed_trainer:228) INFO: 40epoch:train:1401-1500batch: iter_time=1.135e-04, loss_ctc=69.384, loss_att=49.115, acc=0.727, loss=55.171, grad_norm=5.788, loss_scale=1.000, learning_rate=6.397e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 21:14:21,840 (deepspeed_trainer:228) INFO: 40epoch:train:1501-1600batch: iter_time=1.154e-04, loss_ctc=73.506, loss_att=52.192, acc=0.727, loss=58.616, grad_norm=6.247, loss_scale=1.000, learning_rate=6.397e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 21:14:55,376 (deepspeed_trainer:228) INFO: 40epoch:train:1601-1700batch: iter_time=1.140e-04, loss_ctc=70.976, loss_att=53.454, acc=0.725, loss=58.710, grad_norm=6.260, loss_scale=1.000, learning_rate=6.396e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 21:15:28,279 (deepspeed_trainer:228) INFO: 40epoch:train:1701-1800batch: iter_time=1.151e-04, loss_ctc=58.967, loss_att=45.603, acc=0.730, loss=49.618, grad_norm=5.063, loss_scale=1.000, learning_rate=6.396e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 21:15:57,818 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-08 21:16:25,056 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 21:16:42,472 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.1", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.1", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.1", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.1", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 21:16:42,472 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.1, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 21:16:42,474 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 21:17:10,391] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:17:11,362] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:17:10,524] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:17:10,646] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:17:11,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:17:11,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:17:10,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:17:10,732] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:17:10,733] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:17:10,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:17:11,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:17:12,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:17:12,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:17:12,567] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:17:12,570] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:17:12,916] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:00,090] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:00,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:01,092] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:02,015] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:02,086] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:02,130] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:01,204] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:02,585] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:01,666] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:01,724] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:02,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:02,362] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:03,085] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:03,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:03,415] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:05,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:50,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:51,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:51,622] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:52,098] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:53,082] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:52,380] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:53,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:52,671] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:53,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:52,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:53,902] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:53,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:54,178] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:53,566] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:54,392] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:18:58,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:38,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:39,326] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:40,033] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:40,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:40,977] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:42,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:42,494] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:41,601] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:42,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:43,220] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:42,350] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:42,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:43,116] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:44,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:44,894] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:19:49,272] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 21:20:23,485 (deepspeed_trainer:228) INFO: 40epoch:train:1801-1900batch: iter_time=2.562, loss_ctc=83.599, loss_att=68.971, acc=0.709, loss=73.387, grad_norm=7.045, loss_scale=1.000, learning_rate=6.395e-05, step_time=0.390 [2024-12-08 21:20:56,781] [INFO] [logging.py:129:log_dist] [Rank 0] step=587000, skipped=0, lr=[np.float64(6.394199766548618e-05)], mom=[[0.9, 0.98]] [2024-12-08 21:20:56,782] [INFO] [timer.py:264:stop] epoch=0/micro_step=197000/global_step=197000, RunningAvgSamplesPerSec=49.21124262422963, CurrSamplesPerSec=50.3245574848369, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 21:20:56,785 (deepspeed_trainer:228) INFO: 40epoch:train:1901-2000batch: iter_time=1.064e-04, loss_ctc=71.994, loss_att=58.036, acc=0.709, loss=62.218, grad_norm=5.695, loss_scale=1.000, learning_rate=6.394e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 21:21:30,176 (deepspeed_trainer:228) INFO: 40epoch:train:2001-2100batch: iter_time=1.033e-04, loss_ctc=69.948, loss_att=57.540, acc=0.719, loss=61.246, grad_norm=5.668, loss_scale=1.000, learning_rate=6.394e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 21:22:03,326 (deepspeed_trainer:228) INFO: 40epoch:train:2101-2200batch: iter_time=1.077e-04, loss_ctc=72.392, loss_att=54.442, acc=0.722, loss=59.824, grad_norm=6.149, loss_scale=1.000, learning_rate=6.393e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 21:22:36,395 (deepspeed_trainer:228) INFO: 40epoch:train:2201-2300batch: iter_time=1.074e-04, loss_ctc=66.858, loss_att=49.669, acc=0.718, loss=54.811, grad_norm=5.633, loss_scale=1.000, learning_rate=6.393e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 21:23:09,504 (deepspeed_trainer:228) INFO: 40epoch:train:2301-2400batch: iter_time=1.096e-04, loss_ctc=71.271, loss_att=55.877, acc=0.722, loss=60.492, grad_norm=5.926, loss_scale=1.000, learning_rate=6.392e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 21:23:42,253 (deepspeed_trainer:228) INFO: 40epoch:train:2401-2500batch: iter_time=1.099e-04, loss_ctc=63.588, loss_att=45.818, acc=0.724, loss=51.148, grad_norm=5.608, loss_scale=1.000, learning_rate=6.392e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 21:24:15,302 (deepspeed_trainer:228) INFO: 40epoch:train:2501-2600batch: iter_time=1.096e-04, loss_ctc=80.654, loss_att=60.966, acc=0.706, loss=66.857, grad_norm=6.845, loss_scale=1.000, learning_rate=6.391e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 21:24:48,028 (deepspeed_trainer:228) INFO: 40epoch:train:2601-2700batch: iter_time=1.064e-04, loss_ctc=65.584, loss_att=52.302, acc=0.710, loss=56.310, grad_norm=6.511, loss_scale=1.000, learning_rate=6.391e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 21:25:21,870 (deepspeed_trainer:228) INFO: 40epoch:train:2701-2800batch: iter_time=1.077e-04, loss_ctc=63.583, loss_att=46.406, acc=0.730, loss=51.567, grad_norm=5.204, loss_scale=1.000, learning_rate=6.390e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 21:25:55,211 (deepspeed_trainer:228) INFO: 40epoch:train:2801-2900batch: iter_time=1.097e-04, loss_ctc=80.754, loss_att=52.416, acc=0.738, loss=60.886, grad_norm=5.806, loss_scale=1.000, learning_rate=6.390e-05, step_time=0.333 [2024-12-08 21:26:27,797] [INFO] [logging.py:129:log_dist] [Rank 0] step=588000, skipped=0, lr=[np.float64(6.388760217382477e-05)], mom=[[0.9, 0.98]] [2024-12-08 21:26:27,798] [INFO] [timer.py:264:stop] epoch=0/micro_step=198000/global_step=198000, RunningAvgSamplesPerSec=49.2142389878226, CurrSamplesPerSec=51.73421982819271, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 21:26:27,801 (deepspeed_trainer:228) INFO: 40epoch:train:2901-3000batch: iter_time=1.085e-04, loss_ctc=68.980, loss_att=48.702, acc=0.730, loss=54.793, grad_norm=5.874, loss_scale=1.000, learning_rate=6.389e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-08 21:27:00,958 (deepspeed_trainer:228) INFO: 40epoch:train:3001-3100batch: iter_time=1.083e-04, loss_ctc=74.950, loss_att=57.462, acc=0.730, loss=62.672, grad_norm=5.738, loss_scale=1.000, learning_rate=6.388e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 21:27:33,659 (deepspeed_trainer:228) INFO: 40epoch:train:3101-3200batch: iter_time=1.081e-04, loss_ctc=67.547, loss_att=51.235, acc=0.726, loss=56.123, grad_norm=5.326, loss_scale=1.000, learning_rate=6.388e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 21:28:06,352 (deepspeed_trainer:228) INFO: 40epoch:train:3201-3300batch: iter_time=1.072e-04, loss_ctc=66.317, loss_att=53.318, acc=0.724, loss=57.218, grad_norm=5.606, loss_scale=1.000, learning_rate=6.387e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 21:28:38,897 (deepspeed_trainer:228) INFO: 40epoch:train:3301-3400batch: iter_time=1.058e-04, loss_ctc=66.424, loss_att=47.934, acc=0.726, loss=53.495, grad_norm=5.694, loss_scale=1.000, learning_rate=6.387e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-08 21:29:11,604 (deepspeed_trainer:228) INFO: 40epoch:train:3401-3500batch: iter_time=1.069e-04, loss_ctc=74.285, loss_att=53.531, acc=0.732, loss=59.790, grad_norm=5.854, loss_scale=1.000, learning_rate=6.386e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 21:29:44,295 (deepspeed_trainer:228) INFO: 40epoch:train:3501-3600batch: iter_time=1.085e-04, loss_ctc=67.445, loss_att=51.936, acc=0.725, loss=56.604, grad_norm=6.235, loss_scale=1.000, learning_rate=6.386e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 21:30:17,147 (deepspeed_trainer:228) INFO: 40epoch:train:3601-3700batch: iter_time=1.080e-04, loss_ctc=62.984, loss_att=50.029, acc=0.724, loss=53.916, grad_norm=6.028, loss_scale=1.000, learning_rate=6.385e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 21:30:37,996 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-08 21:31:04,540 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 21:31:20,420 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 21:31:20,421 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 21:31:20,423 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 21:31:44,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:31:46,304] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:31:46,347] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:31:46,659] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:31:47,326] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:31:48,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:31:47,846] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:31:49,780] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:31:48,439] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:31:49,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:31:49,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:31:51,028] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:31:51,115] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:31:51,229] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:31:51,550] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:31:52,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:32,747] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:34,507] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:36,855] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:35,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:35,884] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:36,858] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:37,139] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:38,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:38,153] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:39,599] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:38,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:40,391] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:40,489] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:40,839] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:41,758] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:32:42,480] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:20,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:22,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:24,227] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:24,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:26,512] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:25,411] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:25,979] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:26,427] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:28,795] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:27,529] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:29,652] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:29,759] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:30,382] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:30,698] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:31,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:33:34,487] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:07,466] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:10,392] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:12,235] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:12,271] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:13,955] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:12,984] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:13,766] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:14,203] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:15,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:17,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:19,266] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:19,799] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:19,840] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:20,095] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:21,613] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:34:24,035] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 21:35:08,483 (deepspeed_trainer:228) INFO: 40epoch:train:3701-3800batch: iter_time=2.469, loss_ctc=81.563, loss_att=66.672, acc=0.704, loss=71.126, grad_norm=6.550, loss_scale=1.000, learning_rate=6.385e-05, step_time=0.444 [cnode7-012:0/16] 2024-12-08 21:35:43,059 (deepspeed_trainer:228) INFO: 40epoch:train:3801-3900batch: iter_time=1.128e-04, loss_ctc=67.696, loss_att=56.431, acc=0.716, loss=59.824, grad_norm=5.795, loss_scale=1.000, learning_rate=6.384e-05, step_time=0.345 [2024-12-08 21:36:17,266] [INFO] [logging.py:129:log_dist] [Rank 0] step=589000, skipped=0, lr=[np.float64(6.383334526920957e-05)], mom=[[0.9, 0.98]] [2024-12-08 21:36:17,267] [INFO] [timer.py:264:stop] epoch=0/micro_step=199000/global_step=199000, RunningAvgSamplesPerSec=49.20833007768234, CurrSamplesPerSec=50.15571950474442, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 21:36:17,270 (deepspeed_trainer:228) INFO: 40epoch:train:3901-4000batch: iter_time=1.159e-04, loss_ctc=68.882, loss_att=55.777, acc=0.731, loss=59.697, grad_norm=5.296, loss_scale=1.000, learning_rate=6.384e-05, step_time=0.343 [cnode7-012:0/16] 2024-12-08 21:36:50,599 (deepspeed_trainer:228) INFO: 40epoch:train:4001-4100batch: iter_time=1.103e-04, loss_ctc=80.574, loss_att=58.703, acc=0.712, loss=65.263, grad_norm=6.222, loss_scale=1.000, learning_rate=6.383e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 21:37:24,313 (deepspeed_trainer:228) INFO: 40epoch:train:4101-4200batch: iter_time=1.078e-04, loss_ctc=63.740, loss_att=48.792, acc=0.727, loss=53.273, grad_norm=5.850, loss_scale=1.000, learning_rate=6.383e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 21:37:57,574 (deepspeed_trainer:228) INFO: 40epoch:train:4201-4300batch: iter_time=1.072e-04, loss_ctc=67.737, loss_att=50.554, acc=0.725, loss=55.738, grad_norm=5.487, loss_scale=1.000, learning_rate=6.382e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 21:38:30,567 (deepspeed_trainer:228) INFO: 40epoch:train:4301-4400batch: iter_time=1.075e-04, loss_ctc=64.554, loss_att=48.345, acc=0.715, loss=53.215, grad_norm=6.249, loss_scale=1.000, learning_rate=6.381e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 21:39:04,179 (deepspeed_trainer:228) INFO: 40epoch:train:4401-4500batch: iter_time=1.086e-04, loss_ctc=81.604, loss_att=60.143, acc=0.712, loss=66.574, grad_norm=6.179, loss_scale=1.000, learning_rate=6.381e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 21:39:37,039 (deepspeed_trainer:228) INFO: 40epoch:train:4501-4600batch: iter_time=1.105e-04, loss_ctc=62.911, loss_att=48.376, acc=0.718, loss=52.717, grad_norm=5.413, loss_scale=1.000, learning_rate=6.380e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 21:40:09,990 (deepspeed_trainer:228) INFO: 40epoch:train:4601-4700batch: iter_time=1.101e-04, loss_ctc=66.754, loss_att=47.567, acc=0.739, loss=53.301, grad_norm=5.151, loss_scale=1.000, learning_rate=6.380e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 21:40:43,051 (deepspeed_trainer:228) INFO: 40epoch:train:4701-4800batch: iter_time=1.111e-04, loss_ctc=79.706, loss_att=53.877, acc=0.735, loss=61.626, grad_norm=5.613, loss_scale=1.000, learning_rate=6.379e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 21:41:16,020 (deepspeed_trainer:228) INFO: 40epoch:train:4801-4900batch: iter_time=1.152e-04, loss_ctc=67.252, loss_att=46.838, acc=0.735, loss=52.964, grad_norm=5.112, loss_scale=1.000, learning_rate=6.379e-05, step_time=0.329 [2024-12-08 21:41:49,163] [INFO] [logging.py:129:log_dist] [Rank 0] step=590000, skipped=0, lr=[np.float64(6.377922636415913e-05)], mom=[[0.9, 0.98]] [2024-12-08 21:41:49,164] [INFO] [timer.py:264:stop] epoch=0/micro_step=200000/global_step=200000, RunningAvgSamplesPerSec=49.210490036558085, CurrSamplesPerSec=47.004147072683764, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 21:41:49,167 (deepspeed_trainer:228) INFO: 40epoch:train:4901-5000batch: iter_time=1.164e-04, loss_ctc=74.131, loss_att=56.232, acc=0.727, loss=61.601, grad_norm=5.618, loss_scale=1.000, learning_rate=6.378e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 21:42:23,211 (deepspeed_trainer:228) INFO: 40epoch:train:5001-5100batch: iter_time=1.188e-04, loss_ctc=66.364, loss_att=53.787, acc=0.718, loss=57.567, grad_norm=5.776, loss_scale=1.000, learning_rate=6.378e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 21:42:56,612 (deepspeed_trainer:228) INFO: 40epoch:train:5101-5200batch: iter_time=1.106e-04, loss_ctc=70.101, loss_att=52.546, acc=0.731, loss=57.821, grad_norm=5.462, loss_scale=1.000, learning_rate=6.377e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 21:43:29,678 (deepspeed_trainer:228) INFO: 40epoch:train:5201-5300batch: iter_time=1.188e-04, loss_ctc=64.387, loss_att=46.498, acc=0.724, loss=51.849, grad_norm=5.352, loss_scale=1.000, learning_rate=6.377e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 21:44:02,944 (deepspeed_trainer:228) INFO: 40epoch:train:5301-5400batch: iter_time=1.233e-04, loss_ctc=73.071, loss_att=54.996, acc=0.728, loss=60.411, grad_norm=6.454, loss_scale=1.000, learning_rate=6.376e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 21:44:36,268 (deepspeed_trainer:228) INFO: 40epoch:train:5401-5500batch: iter_time=1.147e-04, loss_ctc=64.901, loss_att=49.341, acc=0.735, loss=54.009, grad_norm=5.046, loss_scale=1.000, learning_rate=6.375e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 21:45:09,599 (deepspeed_trainer:228) INFO: 40epoch:train:5501-5600batch: iter_time=1.179e-04, loss_ctc=68.344, loss_att=52.476, acc=0.727, loss=57.224, grad_norm=6.444, loss_scale=1.000, learning_rate=6.375e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 21:45:21,509 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-08 21:45:48,541 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 21:46:05,883 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 21:46:05,883 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 21:46:05,886 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 21:46:29,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:46:29,252] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:46:29,558] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:46:29,808] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:46:31,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:46:32,518] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:46:31,443] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:46:31,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:46:33,686] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:46:33,959] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:46:34,067] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:46:32,756] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:46:34,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:46:35,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:46:35,183] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:46:35,680] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:18,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:19,258] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:19,459] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:19,562] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:20,165] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:23,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:21,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:22,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:22,670] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:24,657] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:24,848] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:25,079] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:25,288] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:25,543] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:25,897] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:47:26,103] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:07,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:08,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:09,041] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:09,243] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:09,766] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:11,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:11,331] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:13,726] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:12,440] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:14,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:12,776] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:15,031] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:15,327] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:15,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:15,704] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:15,743] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:55,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:56,110] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:57,947] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:58,266] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:48:59,641] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:49:01,222] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:49:00,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:49:00,156] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:49:01,325] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:49:03,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:49:03,952] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:49:04,714] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:49:05,542] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:49:05,947] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:49:05,990] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 21:49:08,944] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 21:50:03,044 (deepspeed_trainer:228) INFO: 40epoch:train:5601-5700batch: iter_time=2.543, loss_ctc=77.439, loss_att=67.617, acc=0.703, loss=70.591, grad_norm=6.481, loss_scale=1.000, learning_rate=6.374e-05, step_time=0.392 [cnode7-012:0/16] 2024-12-08 21:50:37,122 (deepspeed_trainer:228) INFO: 40epoch:train:5701-5800batch: iter_time=1.067e-04, loss_ctc=72.390, loss_att=63.170, acc=0.720, loss=65.915, grad_norm=5.833, loss_scale=1.000, learning_rate=6.374e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-08 21:51:10,465 (deepspeed_trainer:228) INFO: 40epoch:train:5801-5900batch: iter_time=1.109e-04, loss_ctc=69.719, loss_att=53.839, acc=0.732, loss=58.602, grad_norm=5.409, loss_scale=1.000, learning_rate=6.373e-05, step_time=0.334 [2024-12-08 21:51:43,715] [INFO] [logging.py:129:log_dist] [Rank 0] step=591000, skipped=0, lr=[np.float64(6.37252448746727e-05)], mom=[[0.9, 0.98]] [2024-12-08 21:51:43,715] [INFO] [timer.py:264:stop] epoch=0/micro_step=201000/global_step=201000, RunningAvgSamplesPerSec=49.20668204514737, CurrSamplesPerSec=49.721974760589106, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 21:51:43,718 (deepspeed_trainer:228) INFO: 40epoch:train:5901-6000batch: iter_time=1.081e-04, loss_ctc=74.728, loss_att=57.489, acc=0.727, loss=62.661, grad_norm=5.919, loss_scale=1.000, learning_rate=6.373e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 21:52:16,963 (deepspeed_trainer:228) INFO: 40epoch:train:6001-6100batch: iter_time=1.095e-04, loss_ctc=67.892, loss_att=54.195, acc=0.732, loss=58.303, grad_norm=5.585, loss_scale=1.000, learning_rate=6.372e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 21:52:50,091 (deepspeed_trainer:228) INFO: 40epoch:train:6101-6200batch: iter_time=1.114e-04, loss_ctc=64.411, loss_att=48.302, acc=0.727, loss=53.134, grad_norm=5.561, loss_scale=1.000, learning_rate=6.372e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 21:53:23,399 (deepspeed_trainer:228) INFO: 40epoch:train:6201-6300batch: iter_time=1.114e-04, loss_ctc=67.128, loss_att=50.035, acc=0.721, loss=55.178, grad_norm=6.376, loss_scale=1.000, learning_rate=6.371e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 21:53:56,669 (deepspeed_trainer:228) INFO: 40epoch:train:6301-6400batch: iter_time=1.180e-04, loss_ctc=75.476, loss_att=59.288, acc=0.712, loss=64.158, grad_norm=6.924, loss_scale=1.000, learning_rate=6.371e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 21:54:29,810 (deepspeed_trainer:228) INFO: 40epoch:train:6401-6500batch: iter_time=1.123e-04, loss_ctc=63.151, loss_att=46.812, acc=0.726, loss=51.727, grad_norm=5.635, loss_scale=1.000, learning_rate=6.370e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 21:55:02,944 (deepspeed_trainer:228) INFO: 40epoch:train:6501-6600batch: iter_time=1.088e-04, loss_ctc=75.974, loss_att=51.701, acc=0.743, loss=58.971, grad_norm=6.078, loss_scale=1.000, learning_rate=6.370e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 21:55:36,413 (deepspeed_trainer:228) INFO: 40epoch:train:6601-6700batch: iter_time=1.107e-04, loss_ctc=72.796, loss_att=54.001, acc=0.736, loss=59.657, grad_norm=5.601, loss_scale=1.000, learning_rate=6.369e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 21:56:09,790 (deepspeed_trainer:228) INFO: 40epoch:train:6701-6800batch: iter_time=1.125e-04, loss_ctc=64.363, loss_att=47.842, acc=0.741, loss=52.812, grad_norm=5.372, loss_scale=1.000, learning_rate=6.368e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 21:56:43,273 (deepspeed_trainer:228) INFO: 40epoch:train:6801-6900batch: iter_time=1.151e-04, loss_ctc=75.271, loss_att=60.812, acc=0.727, loss=65.162, grad_norm=5.721, loss_scale=1.000, learning_rate=6.368e-05, step_time=0.334 [2024-12-08 21:57:16,400] [INFO] [logging.py:129:log_dist] [Rank 0] step=592000, skipped=0, lr=[np.float64(6.367140022020358e-05)], mom=[[0.9, 0.98]] [2024-12-08 21:57:16,400] [INFO] [timer.py:264:stop] epoch=0/micro_step=202000/global_step=202000, RunningAvgSamplesPerSec=49.20839358609662, CurrSamplesPerSec=52.925756550227334, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 21:57:16,403 (deepspeed_trainer:228) INFO: 40epoch:train:6901-7000batch: iter_time=1.120e-04, loss_ctc=65.407, loss_att=52.838, acc=0.732, loss=56.625, grad_norm=5.315, loss_scale=1.000, learning_rate=6.367e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 21:57:49,470 (deepspeed_trainer:228) INFO: 40epoch:train:7001-7100batch: iter_time=1.103e-04, loss_ctc=67.711, loss_att=51.811, acc=0.729, loss=56.582, grad_norm=6.412, loss_scale=1.000, learning_rate=6.367e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 21:58:22,846 (deepspeed_trainer:228) INFO: 40epoch:train:7101-7200batch: iter_time=1.111e-04, loss_ctc=69.588, loss_att=51.915, acc=0.732, loss=57.202, grad_norm=6.041, loss_scale=1.000, learning_rate=6.366e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 21:58:56,196 (deepspeed_trainer:228) INFO: 40epoch:train:7201-7300batch: iter_time=1.122e-04, loss_ctc=70.103, loss_att=50.769, acc=0.730, loss=56.563, grad_norm=5.437, loss_scale=1.000, learning_rate=6.366e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 21:59:29,335 (deepspeed_trainer:228) INFO: 40epoch:train:7301-7400batch: iter_time=1.108e-04, loss_ctc=63.615, loss_att=49.339, acc=0.735, loss=53.631, grad_norm=6.037, loss_scale=1.000, learning_rate=6.365e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 22:00:02,416 (deepspeed_trainer:228) INFO: 40epoch:train:7401-7500batch: iter_time=1.071e-04, loss_ctc=73.774, loss_att=57.673, acc=0.729, loss=62.493, grad_norm=6.330, loss_scale=1.000, learning_rate=6.365e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 22:00:06,434 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-08 22:00:33,757 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 22:00:51,534 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 22:00:51,534 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 22:00:51,537 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 22:01:14,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:01:14,254] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:01:14,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:01:15,729] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:01:17,544] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:01:16,203] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:01:16,461] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:01:18,105] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:01:16,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:01:18,206] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:01:16,860] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:01:18,754] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:01:19,538] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:01:19,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:01:20,086] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:01:20,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:03,618] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:03,821] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:03,997] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:04,365] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:05,754] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:06,217] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:06,421] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:06,575] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:08,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:09,269] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:09,533] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:09,906] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:10,166] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:11,432] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:11,978] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:14,214] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:52,150] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:53,188] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:55,234] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:55,544] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:55,880] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:56,118] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:58,057] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:56,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:56,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:58,448] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:59,019] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:59,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:02:59,909] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:00,535] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:00,929] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:05,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:40,387] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:41,142] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:44,550] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:44,662] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:45,072] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:45,319] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:48,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:48,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:50,622] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:50,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:50,712] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:50,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:51,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:51,930] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:52,560] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:03:59,410] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 22:05:01,966 (deepspeed_trainer:228) INFO: 40epoch:train:7501-7600batch: iter_time=2.587, loss_ctc=73.933, loss_att=65.464, acc=0.693, loss=68.005, grad_norm=7.092, loss_scale=1.000, learning_rate=6.364e-05, step_time=0.408 [cnode7-012:0/16] 2024-12-08 22:05:35,424 (deepspeed_trainer:228) INFO: 40epoch:train:7601-7700batch: iter_time=1.059e-04, loss_ctc=67.875, loss_att=56.187, acc=0.728, loss=59.693, grad_norm=5.502, loss_scale=1.000, learning_rate=6.364e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 22:06:08,562 (deepspeed_trainer:228) INFO: 40epoch:train:7701-7800batch: iter_time=1.076e-04, loss_ctc=71.797, loss_att=57.157, acc=0.722, loss=61.566, grad_norm=5.532, loss_scale=1.000, learning_rate=6.363e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 22:06:41,415 (deepspeed_trainer:228) INFO: 40epoch:train:7801-7900batch: iter_time=1.094e-04, loss_ctc=74.293, loss_att=55.282, acc=0.713, loss=60.993, grad_norm=6.400, loss_scale=1.000, learning_rate=6.363e-05, step_time=0.328 [2024-12-08 22:07:14,929] [INFO] [logging.py:129:log_dist] [Rank 0] step=593000, skipped=0, lr=[np.float64(6.361769182363313e-05)], mom=[[0.9, 0.98]] [2024-12-08 22:07:14,932] [INFO] [timer.py:264:stop] epoch=0/micro_step=203000/global_step=203000, RunningAvgSamplesPerSec=49.20490559664263, CurrSamplesPerSec=51.62588859732531, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 22:07:14,935 (deepspeed_trainer:228) INFO: 40epoch:train:7901-8000batch: iter_time=1.082e-04, loss_ctc=66.610, loss_att=50.860, acc=0.734, loss=55.554, grad_norm=5.487, loss_scale=1.000, learning_rate=6.362e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 22:07:47,916 (deepspeed_trainer:228) INFO: 40epoch:train:8001-8100batch: iter_time=1.086e-04, loss_ctc=65.559, loss_att=49.262, acc=0.720, loss=54.166, grad_norm=5.926, loss_scale=1.000, learning_rate=6.361e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 22:08:21,422 (deepspeed_trainer:228) INFO: 40epoch:train:8101-8200batch: iter_time=1.147e-04, loss_ctc=72.721, loss_att=53.650, acc=0.712, loss=59.374, grad_norm=6.419, loss_scale=1.000, learning_rate=6.361e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 22:08:54,439 (deepspeed_trainer:228) INFO: 40epoch:train:8201-8300batch: iter_time=1.080e-04, loss_ctc=70.278, loss_att=55.368, acc=0.713, loss=59.876, grad_norm=5.890, loss_scale=1.000, learning_rate=6.360e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 22:09:27,466 (deepspeed_trainer:228) INFO: 40epoch:train:8301-8400batch: iter_time=1.067e-04, loss_ctc=62.700, loss_att=47.478, acc=0.730, loss=52.057, grad_norm=5.549, loss_scale=1.000, learning_rate=6.360e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 22:10:00,394 (deepspeed_trainer:228) INFO: 40epoch:train:8401-8500batch: iter_time=1.069e-04, loss_ctc=76.544, loss_att=50.183, acc=0.738, loss=58.077, grad_norm=6.249, loss_scale=1.000, learning_rate=6.359e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 22:10:33,368 (deepspeed_trainer:228) INFO: 40epoch:train:8501-8600batch: iter_time=1.137e-04, loss_ctc=72.180, loss_att=51.176, acc=0.732, loss=57.470, grad_norm=6.021, loss_scale=1.000, learning_rate=6.359e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 22:11:06,562 (deepspeed_trainer:228) INFO: 40epoch:train:8601-8700batch: iter_time=1.104e-04, loss_ctc=65.513, loss_att=49.466, acc=0.740, loss=54.268, grad_norm=5.526, loss_scale=1.000, learning_rate=6.358e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 22:11:39,770 (deepspeed_trainer:228) INFO: 40epoch:train:8701-8800batch: iter_time=1.079e-04, loss_ctc=73.681, loss_att=57.914, acc=0.721, loss=62.657, grad_norm=5.304, loss_scale=1.000, learning_rate=6.358e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 22:12:13,018 (deepspeed_trainer:228) INFO: 40epoch:train:8801-8900batch: iter_time=1.086e-04, loss_ctc=63.363, loss_att=53.032, acc=0.723, loss=56.129, grad_norm=5.609, loss_scale=1.000, learning_rate=6.357e-05, step_time=0.333 [2024-12-08 22:12:46,187] [INFO] [logging.py:129:log_dist] [Rank 0] step=594000, skipped=0, lr=[np.float64(6.356411911124458e-05)], mom=[[0.9, 0.98]] [2024-12-08 22:12:46,188] [INFO] [timer.py:264:stop] epoch=0/micro_step=204000/global_step=204000, RunningAvgSamplesPerSec=49.207587788248055, CurrSamplesPerSec=52.98245865015692, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 22:12:46,190 (deepspeed_trainer:228) INFO: 40epoch:train:8901-9000batch: iter_time=1.081e-04, loss_ctc=67.744, loss_att=47.994, acc=0.732, loss=53.929, grad_norm=5.981, loss_scale=1.000, learning_rate=6.357e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 22:13:19,756 (deepspeed_trainer:228) INFO: 40epoch:train:9001-9100batch: iter_time=1.092e-04, loss_ctc=71.333, loss_att=51.112, acc=0.731, loss=57.200, grad_norm=6.324, loss_scale=1.000, learning_rate=6.356e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 22:13:52,999 (deepspeed_trainer:228) INFO: 40epoch:train:9101-9200batch: iter_time=1.096e-04, loss_ctc=70.141, loss_att=52.327, acc=0.727, loss=57.684, grad_norm=6.244, loss_scale=1.000, learning_rate=6.356e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 22:14:26,059 (deepspeed_trainer:228) INFO: 40epoch:train:9201-9300batch: iter_time=1.159e-04, loss_ctc=58.389, loss_att=44.956, acc=0.734, loss=49.009, grad_norm=5.015, loss_scale=1.000, learning_rate=6.355e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 22:14:55,237 (multiple_iter_factory:32) INFO: Building 5th iter-factory... [cnode7-012:0/16] 2024-12-08 22:15:21,872 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 22:15:38,517 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.6", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.6", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.6", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.6", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 22:15:38,517 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.6, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 22:15:38,519 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 22:16:03,413] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:03,484] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:03,835] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:04,112] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:04,684] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:05,286] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:07,200] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:05,657] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:07,259] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:07,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:08,119] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:08,205] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:08,209] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:06,884] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:08,923] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:11,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:53,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:55,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:54,510] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:55,372] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:55,002] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:55,309] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:55,402] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:56,286] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:56,612] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:56,058] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:56,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:56,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:57,059] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:58,278] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:16:58,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:04,566] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:43,710] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:44,841] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:44,162] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:44,889] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:44,636] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:44,876] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:45,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:45,608] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:46,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:45,973] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:46,708] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:47,585] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:47,990] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:49,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:49,475] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:17:56,047] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:31,323] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:33,068] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:33,641] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:34,491] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:34,531] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:34,222] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:34,284] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:35,326] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:34,862] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:35,599] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:36,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:37,096] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:38,115] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:38,523] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:40,144] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:18:46,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 22:19:22,835 (deepspeed_trainer:228) INFO: 40epoch:train:9301-9400batch: iter_time=2.547, loss_ctc=81.538, loss_att=69.295, acc=0.713, loss=72.952, grad_norm=7.925, loss_scale=1.000, learning_rate=6.355e-05, step_time=0.429 [cnode7-012:0/16] 2024-12-08 22:19:56,590 (deepspeed_trainer:228) INFO: 40epoch:train:9401-9500batch: iter_time=1.090e-04, loss_ctc=71.478, loss_att=58.507, acc=0.714, loss=62.391, grad_norm=5.985, loss_scale=1.000, learning_rate=6.354e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 22:20:31,133 (deepspeed_trainer:228) INFO: 40epoch:train:9501-9600batch: iter_time=1.149e-04, loss_ctc=69.375, loss_att=57.409, acc=0.728, loss=60.977, grad_norm=5.770, loss_scale=1.000, learning_rate=6.353e-05, step_time=0.345 [cnode7-012:0/16] 2024-12-08 22:21:04,517 (deepspeed_trainer:228) INFO: 40epoch:train:9601-9700batch: iter_time=1.109e-04, loss_ctc=72.182, loss_att=55.066, acc=0.732, loss=60.193, grad_norm=6.315, loss_scale=1.000, learning_rate=6.353e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 22:21:37,728 (deepspeed_trainer:228) INFO: 40epoch:train:9701-9800batch: iter_time=1.160e-04, loss_ctc=66.539, loss_att=51.038, acc=0.727, loss=55.687, grad_norm=5.956, loss_scale=1.000, learning_rate=6.352e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 22:22:11,392 (deepspeed_trainer:228) INFO: 40epoch:train:9801-9900batch: iter_time=1.102e-04, loss_ctc=70.916, loss_att=55.947, acc=0.731, loss=60.459, grad_norm=5.773, loss_scale=1.000, learning_rate=6.352e-05, step_time=0.336 [2024-12-08 22:22:44,370] [INFO] [logging.py:129:log_dist] [Rank 0] step=595000, skipped=0, lr=[np.float64(6.35106815126974e-05)], mom=[[0.9, 0.98]] [2024-12-08 22:22:44,370] [INFO] [timer.py:264:stop] epoch=0/micro_step=205000/global_step=205000, RunningAvgSamplesPerSec=49.200618294924745, CurrSamplesPerSec=50.67783356536407, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 22:22:44,373 (deepspeed_trainer:228) INFO: 40epoch:train:9901-10000batch: iter_time=1.120e-04, loss_ctc=62.993, loss_att=46.266, acc=0.728, loss=51.291, grad_norm=5.295, loss_scale=1.000, learning_rate=6.351e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 22:23:18,962 (deepspeed_trainer:228) INFO: 40epoch:train:10001-10100batch: iter_time=1.127e-04, loss_ctc=79.036, loss_att=61.526, acc=0.713, loss=66.779, grad_norm=6.697, loss_scale=1.000, learning_rate=6.351e-05, step_time=0.346 [cnode7-012:0/16] 2024-12-08 22:23:51,782 (deepspeed_trainer:228) INFO: 40epoch:train:10101-10200batch: iter_time=1.141e-04, loss_ctc=64.175, loss_att=51.382, acc=0.723, loss=55.221, grad_norm=5.446, loss_scale=1.000, learning_rate=6.350e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 22:24:24,613 (deepspeed_trainer:228) INFO: 40epoch:train:10201-10300batch: iter_time=1.184e-04, loss_ctc=63.029, loss_att=46.260, acc=0.735, loss=51.284, grad_norm=5.310, loss_scale=1.000, learning_rate=6.350e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 22:24:57,728 (deepspeed_trainer:228) INFO: 40epoch:train:10301-10400batch: iter_time=1.219e-04, loss_ctc=78.823, loss_att=52.151, acc=0.747, loss=60.169, grad_norm=6.658, loss_scale=1.000, learning_rate=6.349e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 22:25:30,476 (deepspeed_trainer:228) INFO: 40epoch:train:10401-10500batch: iter_time=1.218e-04, loss_ctc=68.679, loss_att=49.216, acc=0.735, loss=55.068, grad_norm=6.124, loss_scale=1.000, learning_rate=6.349e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 22:26:03,826 (deepspeed_trainer:228) INFO: 40epoch:train:10501-10600batch: iter_time=1.199e-04, loss_ctc=74.047, loss_att=59.155, acc=0.737, loss=63.622, grad_norm=6.266, loss_scale=1.000, learning_rate=6.348e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 22:26:36,776 (deepspeed_trainer:228) INFO: 40epoch:train:10601-10700batch: iter_time=1.233e-04, loss_ctc=67.618, loss_att=52.848, acc=0.729, loss=57.283, grad_norm=5.969, loss_scale=1.000, learning_rate=6.348e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 22:27:09,865 (deepspeed_trainer:228) INFO: 40epoch:train:10701-10800batch: iter_time=1.194e-04, loss_ctc=65.525, loss_att=53.272, acc=0.729, loss=56.930, grad_norm=5.715, loss_scale=1.000, learning_rate=6.347e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 22:27:42,244 (deepspeed_trainer:228) INFO: 40epoch:train:10801-10900batch: iter_time=1.171e-04, loss_ctc=65.463, loss_att=47.768, acc=0.733, loss=53.085, grad_norm=6.060, loss_scale=1.000, learning_rate=6.347e-05, step_time=0.330 [2024-12-08 22:28:15,341] [INFO] [logging.py:129:log_dist] [Rank 0] step=596000, skipped=0, lr=[np.float64(6.345737846100181e-05)], mom=[[0.9, 0.98]] [2024-12-08 22:28:15,341] [INFO] [timer.py:264:stop] epoch=0/micro_step=206000/global_step=206000, RunningAvgSamplesPerSec=49.20371397499917, CurrSamplesPerSec=52.17107512106324, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 22:28:15,344 (deepspeed_trainer:228) INFO: 40epoch:train:10901-11000batch: iter_time=1.173e-04, loss_ctc=74.147, loss_att=53.906, acc=0.742, loss=59.962, grad_norm=5.918, loss_scale=1.000, learning_rate=6.346e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 22:28:48,299 (deepspeed_trainer:228) INFO: 40epoch:train:11001-11100batch: iter_time=1.184e-04, loss_ctc=66.176, loss_att=52.138, acc=0.729, loss=56.363, grad_norm=6.026, loss_scale=1.000, learning_rate=6.345e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 22:29:21,417 (deepspeed_trainer:228) INFO: 40epoch:train:11101-11200batch: iter_time=1.176e-04, loss_ctc=62.386, loss_att=49.447, acc=0.730, loss=53.325, grad_norm=6.005, loss_scale=1.000, learning_rate=6.345e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 22:29:42,005 (multiple_iter_factory:32) INFO: Building 6th iter-factory... [cnode7-012:0/16] 2024-12-08 22:30:08,748 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 22:30:25,789 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 22:30:25,790 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 22:30:25,792 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 22:30:52,154] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:30:52,500] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:30:52,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:30:53,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:30:53,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:30:53,673] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:30:53,876] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:30:54,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:30:54,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:30:54,772] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:30:54,948] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:30:55,317] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:30:55,453] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:30:55,598] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:30:56,000] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:30:56,140] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:42,576] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:42,895] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:43,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:43,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:43,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:44,000] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:44,042] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:44,460] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:44,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:45,031] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:45,246] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:45,456] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:45,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:45,713] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:46,985] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:31:49,791] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:31,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:31,715] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:32,632] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:32,542] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:33,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:33,763] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:34,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:34,330] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:34,545] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:35,344] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:35,936] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:36,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:36,913] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:37,494] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:37,843] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:32:41,775] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:20,198] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:21,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:22,118] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:22,346] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:23,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:23,135] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:23,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:23,505] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:24,107] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:24,960] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:25,936] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:26,751] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:28,155] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:28,905] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:30,611] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:33:33,180] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 22:34:18,961 (deepspeed_trainer:228) INFO: 40epoch:train:11201-11300batch: iter_time=2.558, loss_ctc=80.101, loss_att=66.929, acc=0.713, loss=70.876, grad_norm=6.796, loss_scale=1.000, learning_rate=6.344e-05, step_time=0.407 [cnode7-012:0/16] 2024-12-08 22:34:52,628 (deepspeed_trainer:228) INFO: 40epoch:train:11301-11400batch: iter_time=1.072e-04, loss_ctc=66.959, loss_att=56.010, acc=0.723, loss=59.296, grad_norm=5.608, loss_scale=1.000, learning_rate=6.344e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 22:35:26,261 (deepspeed_trainer:228) INFO: 40epoch:train:11401-11500batch: iter_time=1.119e-04, loss_ctc=68.240, loss_att=54.878, acc=0.740, loss=58.911, grad_norm=5.193, loss_scale=1.000, learning_rate=6.343e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 22:36:00,076 (deepspeed_trainer:228) INFO: 40epoch:train:11501-11600batch: iter_time=1.121e-04, loss_ctc=79.538, loss_att=59.005, acc=0.727, loss=65.163, grad_norm=6.320, loss_scale=1.000, learning_rate=6.343e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 22:36:33,600 (deepspeed_trainer:228) INFO: 40epoch:train:11601-11700batch: iter_time=1.144e-04, loss_ctc=62.869, loss_att=48.737, acc=0.737, loss=52.983, grad_norm=5.423, loss_scale=1.000, learning_rate=6.342e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 22:37:07,089 (deepspeed_trainer:228) INFO: 40epoch:train:11701-11800batch: iter_time=1.098e-04, loss_ctc=66.724, loss_att=50.626, acc=0.731, loss=55.455, grad_norm=6.106, loss_scale=1.000, learning_rate=6.342e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 22:37:40,447 (deepspeed_trainer:228) INFO: 40epoch:train:11801-11900batch: iter_time=1.119e-04, loss_ctc=63.271, loss_att=48.284, acc=0.721, loss=52.756, grad_norm=5.926, loss_scale=1.000, learning_rate=6.341e-05, step_time=0.333 [2024-12-08 22:38:14,369] [INFO] [logging.py:129:log_dist] [Rank 0] step=597000, skipped=0, lr=[np.float64(6.340420939249345e-05)], mom=[[0.9, 0.98]] [2024-12-08 22:38:14,370] [INFO] [timer.py:264:stop] epoch=0/micro_step=207000/global_step=207000, RunningAvgSamplesPerSec=49.19792542562264, CurrSamplesPerSec=47.43292045297327, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 22:38:14,373 (deepspeed_trainer:228) INFO: 40epoch:train:11901-12000batch: iter_time=1.127e-04, loss_ctc=80.807, loss_att=61.928, acc=0.715, loss=67.561, grad_norm=6.121, loss_scale=1.000, learning_rate=6.341e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 22:38:47,712 (deepspeed_trainer:228) INFO: 40epoch:train:12001-12100batch: iter_time=1.108e-04, loss_ctc=62.318, loss_att=48.471, acc=0.724, loss=52.630, grad_norm=5.938, loss_scale=1.000, learning_rate=6.340e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 22:39:21,188 (deepspeed_trainer:228) INFO: 40epoch:train:12101-12200batch: iter_time=1.103e-04, loss_ctc=65.809, loss_att=46.750, acc=0.746, loss=52.477, grad_norm=5.487, loss_scale=1.000, learning_rate=6.340e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 22:39:55,100 (deepspeed_trainer:228) INFO: 40epoch:train:12201-12300batch: iter_time=1.118e-04, loss_ctc=78.293, loss_att=54.500, acc=0.740, loss=61.638, grad_norm=5.763, loss_scale=1.000, learning_rate=6.339e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 22:40:28,771 (deepspeed_trainer:228) INFO: 40epoch:train:12301-12400batch: iter_time=1.164e-04, loss_ctc=66.312, loss_att=47.312, acc=0.741, loss=53.028, grad_norm=5.623, loss_scale=1.000, learning_rate=6.339e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 22:41:02,742 (deepspeed_trainer:228) INFO: 40epoch:train:12401-12500batch: iter_time=1.136e-04, loss_ctc=73.086, loss_att=57.011, acc=0.737, loss=61.825, grad_norm=5.297, loss_scale=1.000, learning_rate=6.338e-05, step_time=0.339 [cnode7-012:0/16] 2024-12-08 22:41:36,341 (deepspeed_trainer:228) INFO: 40epoch:train:12501-12600batch: iter_time=1.152e-04, loss_ctc=65.675, loss_att=54.257, acc=0.726, loss=57.695, grad_norm=5.440, loss_scale=1.000, learning_rate=6.337e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 22:42:10,154 (deepspeed_trainer:228) INFO: 40epoch:train:12601-12700batch: iter_time=1.154e-04, loss_ctc=70.045, loss_att=52.917, acc=0.737, loss=58.042, grad_norm=6.322, loss_scale=1.000, learning_rate=6.337e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 22:42:43,843 (deepspeed_trainer:228) INFO: 40epoch:train:12701-12800batch: iter_time=1.171e-04, loss_ctc=63.877, loss_att=47.032, acc=0.729, loss=52.078, grad_norm=5.950, loss_scale=1.000, learning_rate=6.336e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 22:43:17,473 (deepspeed_trainer:228) INFO: 40epoch:train:12801-12900batch: iter_time=1.117e-04, loss_ctc=72.766, loss_att=55.762, acc=0.731, loss=60.878, grad_norm=6.667, loss_scale=1.000, learning_rate=6.336e-05, step_time=0.336 [2024-12-08 22:43:50,919] [INFO] [logging.py:129:log_dist] [Rank 0] step=598000, skipped=0, lr=[np.float64(6.335117374680837e-05)], mom=[[0.9, 0.98]] [2024-12-08 22:43:50,920] [INFO] [timer.py:264:stop] epoch=0/micro_step=208000/global_step=208000, RunningAvgSamplesPerSec=49.196925048242704, CurrSamplesPerSec=51.107698171994556, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 22:43:50,922 (deepspeed_trainer:228) INFO: 40epoch:train:12901-13000batch: iter_time=1.162e-04, loss_ctc=64.502, loss_att=50.332, acc=0.736, loss=54.589, grad_norm=5.226, loss_scale=1.000, learning_rate=6.335e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 22:44:24,387 (deepspeed_trainer:228) INFO: 40epoch:train:13001-13100batch: iter_time=1.172e-04, loss_ctc=67.974, loss_att=52.362, acc=0.731, loss=57.047, grad_norm=6.296, loss_scale=1.000, learning_rate=6.335e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 22:44:36,346 (multiple_iter_factory:32) INFO: Building 7th iter-factory... [cnode7-012:0/16] 2024-12-08 22:45:04,045 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 22:45:21,538 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.2", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.2", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.2", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.2", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 22:45:21,538 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.2, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 22:45:21,541 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 22:45:45,406] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:45:45,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:45:45,897] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:45:46,402] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:45:47,025] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:45:48,181] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:45:47,425] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:45:47,737] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:45:49,276] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:45:48,380] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:45:50,093] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:45:50,225] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:45:50,303] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:45:50,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:45:50,864] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:45:51,190] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:34,495] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:34,895] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:34,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:35,900] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:36,898] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:38,020] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:37,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:38,043] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:38,377] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:39,488] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:39,807] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:40,047] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:40,255] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:40,601] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:41,050] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:46:41,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:22,514] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:23,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:23,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:24,817] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:25,754] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:26,487] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:26,718] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:27,877] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:28,697] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:29,233] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:30,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:30,406] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:30,501] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:30,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:30,043] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:47:31,090] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:11,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:12,218] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:12,898] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:14,611] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:14,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:16,365] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:15,859] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:17,800] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:18,184] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:20,092] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:20,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:20,504] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:20,577] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:20,797] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:20,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:48:21,241] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 22:49:13,856 (deepspeed_trainer:228) INFO: 40epoch:train:13101-13200batch: iter_time=2.555, loss_ctc=76.619, loss_att=63.627, acc=0.711, loss=67.523, grad_norm=6.733, loss_scale=1.000, learning_rate=6.334e-05, step_time=0.349 [cnode7-012:0/16] 2024-12-08 22:49:47,363 (deepspeed_trainer:228) INFO: 40epoch:train:13201-13300batch: iter_time=1.100e-04, loss_ctc=72.363, loss_att=61.878, acc=0.726, loss=65.021, grad_norm=6.154, loss_scale=1.000, learning_rate=6.334e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 22:50:20,263 (deepspeed_trainer:228) INFO: 40epoch:train:13301-13400batch: iter_time=1.014e-04, loss_ctc=69.390, loss_att=53.574, acc=0.733, loss=58.323, grad_norm=5.956, loss_scale=1.000, learning_rate=6.333e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 22:50:53,547 (deepspeed_trainer:228) INFO: 40epoch:train:13401-13500batch: iter_time=1.081e-04, loss_ctc=73.987, loss_att=55.383, acc=0.731, loss=60.963, grad_norm=5.620, loss_scale=1.000, learning_rate=6.333e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 22:51:26,687 (deepspeed_trainer:228) INFO: 40epoch:train:13501-13600batch: iter_time=1.082e-04, loss_ctc=67.314, loss_att=53.583, acc=0.733, loss=57.695, grad_norm=5.981, loss_scale=1.000, learning_rate=6.332e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 22:51:59,703 (deepspeed_trainer:228) INFO: 40epoch:train:13601-13700batch: iter_time=1.093e-04, loss_ctc=63.810, loss_att=47.372, acc=0.730, loss=52.298, grad_norm=5.064, loss_scale=1.000, learning_rate=6.332e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 22:52:32,732 (deepspeed_trainer:228) INFO: 40epoch:train:13701-13800batch: iter_time=1.082e-04, loss_ctc=66.802, loss_att=50.049, acc=0.723, loss=55.072, grad_norm=6.285, loss_scale=1.000, learning_rate=6.331e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 22:53:06,017 (deepspeed_trainer:228) INFO: 40epoch:train:13801-13900batch: iter_time=1.089e-04, loss_ctc=75.118, loss_att=58.656, acc=0.715, loss=63.584, grad_norm=6.768, loss_scale=1.000, learning_rate=6.331e-05, step_time=0.332 [2024-12-08 22:53:38,918] [INFO] [logging.py:129:log_dist] [Rank 0] step=599000, skipped=0, lr=[np.float64(6.329827096685826e-05)], mom=[[0.9, 0.98]] [2024-12-08 22:53:38,919] [INFO] [timer.py:264:stop] epoch=0/micro_step=209000/global_step=209000, RunningAvgSamplesPerSec=49.19804698198592, CurrSamplesPerSec=49.87488652107256, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 22:53:38,922 (deepspeed_trainer:228) INFO: 40epoch:train:13901-14000batch: iter_time=1.091e-04, loss_ctc=62.180, loss_att=45.803, acc=0.730, loss=50.722, grad_norm=5.368, loss_scale=1.000, learning_rate=6.330e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 22:54:11,939 (deepspeed_trainer:228) INFO: 40epoch:train:14001-14100batch: iter_time=1.074e-04, loss_ctc=75.168, loss_att=51.195, acc=0.745, loss=58.400, grad_norm=5.258, loss_scale=1.000, learning_rate=6.330e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 22:54:44,889 (deepspeed_trainer:228) INFO: 40epoch:train:14101-14200batch: iter_time=1.069e-04, loss_ctc=71.853, loss_att=53.083, acc=0.740, loss=58.711, grad_norm=5.679, loss_scale=1.000, learning_rate=6.329e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 22:55:17,735 (deepspeed_trainer:228) INFO: 40epoch:train:14201-14300batch: iter_time=1.087e-04, loss_ctc=64.124, loss_att=47.938, acc=0.742, loss=52.794, grad_norm=5.372, loss_scale=1.000, learning_rate=6.329e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 22:55:50,912 (deepspeed_trainer:228) INFO: 40epoch:train:14301-14400batch: iter_time=1.099e-04, loss_ctc=75.171, loss_att=59.990, acc=0.729, loss=64.505, grad_norm=5.663, loss_scale=1.000, learning_rate=6.328e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 22:56:23,838 (deepspeed_trainer:228) INFO: 40epoch:train:14401-14500batch: iter_time=1.079e-04, loss_ctc=64.817, loss_att=51.510, acc=0.735, loss=55.501, grad_norm=5.261, loss_scale=1.000, learning_rate=6.327e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 22:56:56,648 (deepspeed_trainer:228) INFO: 40epoch:train:14501-14600batch: iter_time=1.076e-04, loss_ctc=66.662, loss_att=50.436, acc=0.733, loss=55.312, grad_norm=5.339, loss_scale=1.000, learning_rate=6.327e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 22:57:30,383 (deepspeed_trainer:228) INFO: 40epoch:train:14601-14700batch: iter_time=1.084e-04, loss_ctc=69.718, loss_att=51.139, acc=0.736, loss=56.713, grad_norm=5.451, loss_scale=1.000, learning_rate=6.326e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 22:58:03,887 (deepspeed_trainer:228) INFO: 40epoch:train:14701-14800batch: iter_time=1.081e-04, loss_ctc=69.627, loss_att=50.127, acc=0.733, loss=55.952, grad_norm=6.030, loss_scale=1.000, learning_rate=6.326e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 22:58:37,446 (deepspeed_trainer:228) INFO: 40epoch:train:14801-14900batch: iter_time=1.094e-04, loss_ctc=62.306, loss_att=49.270, acc=0.737, loss=53.193, grad_norm=5.346, loss_scale=1.000, learning_rate=6.325e-05, step_time=0.335 [2024-12-08 22:59:10,648] [INFO] [logging.py:129:log_dist] [Rank 0] step=600000, skipped=0, lr=[np.float64(6.324550049880579e-05)], mom=[[0.9, 0.98]] [2024-12-08 22:59:10,649] [INFO] [timer.py:264:stop] epoch=0/micro_step=210000/global_step=210000, RunningAvgSamplesPerSec=49.20009604821355, CurrSamplesPerSec=47.981781252644, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 22:59:10,651 (deepspeed_trainer:228) INFO: 40epoch:train:14901-15000batch: iter_time=1.073e-04, loss_ctc=72.818, loss_att=57.369, acc=0.731, loss=62.013, grad_norm=6.420, loss_scale=1.000, learning_rate=6.325e-05, step_time=0.331 [2024-12-08 22:59:25,983] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:26,518] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:27,060] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:27,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:27,458] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:27,542] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:27,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:27,588] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:28,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:28,502] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:28,518] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:28,606] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:28,660] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:28,731] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:28,740] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:28,756] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:41,956] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:42,653] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:42,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:43,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:43,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:44,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:44,430] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:44,482] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:44,770] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:44,825] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:45,307] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:45,533] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:45,588] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:46,171] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:46,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:46,471] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:57,477] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:58,039] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:58,408] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:58,748] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:59,063] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 22:59:59,951] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:00,578] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:00,469] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:00,516] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:01,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:01,779] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:02,048] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:02,208] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:02,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:03,004] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:03,110] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:13,717] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:13,767] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:14,353] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:14,389] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:14,685] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:15,299] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:15,740] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:16,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:16,176] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:16,445] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:16,740] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:18,069] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:18,221] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:18,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:18,604] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:18,741] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:00:31,477] [INFO] [logging.py:129:log_dist] [Rank 0] [Torch] Checkpoint 40 is about to be saved! [2024-12-08 23:00:31,508] [INFO] [logging.py:129:log_dist] [Rank 0] Saving model checkpoint: exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/mp_rank_00_model_states.pt [2024-12-08 23:00:31,508] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/mp_rank_00_model_states.pt... [2024-12-08 23:00:33,600] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/mp_rank_00_model_states.pt. [2024-12-08 23:00:33,756] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [2024-12-08 23:00:33,758] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... [2024-12-08 23:00:33,555] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... [2024-12-08 23:00:33,556] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... [2024-12-08 23:00:33,759] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... [2024-12-08 23:00:33,759] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... [2024-12-08 23:00:33,557] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... [2024-12-08 23:00:33,557] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... [2024-12-08 23:00:33,558] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... [2024-12-08 23:00:33,559] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... [2024-12-08 23:00:33,763] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... [2024-12-08 23:00:33,560] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... [2024-12-08 23:00:33,764] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... [2024-12-08 23:00:33,764] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [2024-12-08 23:00:33,561] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... [2024-12-08 23:00:33,765] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... [2024-12-08 23:00:34,395] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,395] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,395] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [2024-12-08 23:00:34,231] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,231] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,231] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [2024-12-08 23:00:34,517] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,517] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,517] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [2024-12-08 23:00:34,323] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,324] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,324] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [2024-12-08 23:00:34,357] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,358] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,358] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [2024-12-08 23:00:34,363] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,363] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,363] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [2024-12-08 23:00:34,375] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,375] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,376] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [2024-12-08 23:00:34,377] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,377] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,580] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,377] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [2024-12-08 23:00:34,581] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,581] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [2024-12-08 23:00:34,381] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,381] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,381] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [2024-12-08 23:00:34,588] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,588] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,588] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [2024-12-08 23:00:34,409] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,410] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,410] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [2024-12-08 23:00:34,622] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,622] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,622] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [2024-12-08 23:00:34,624] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,624] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,624] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [2024-12-08 23:00:34,641] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,641] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,641] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [2024-12-08 23:00:34,644] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. [2024-12-08 23:00:34,653] [INFO] [engine.py:3536:_save_zero_checkpoint] zero checkpoint saved exp_owsm/s2t_train_05b_ds_raw_bpe50000/checkpoint_40/40/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt [2024-12-08 23:00:34,653] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint 40 is ready now! [cnode7-012:0/16] 2024-12-08 23:00:34,674 (deepspeed_trainer:158) INFO: 40epoch results: [train] iter_time=0.135, loss_ctc=69.885, loss_att=53.386, acc=0.726, loss=58.336, grad_norm=5.883, loss_scale=1.000, learning_rate=6.365e-05, step_time=0.336, time=1 hour, 58 minutes and 3.16 seconds, total_count=600040, gpu_max_cached_mem_GB=44.875, [valid] loss_ctc=3.844, cer_ctc=0.091, loss_att=5.344, acc=0.920, cer=0.082, wer=0.812, loss=4.906, time=1 minute and 11.22 seconds, total_count=40, gpu_max_cached_mem_GB=44.875 [cnode7-012:0/16] 2024-12-08 23:00:36,280 (multiple_iter_factory:32) INFO: Building 0th iter-factory... [cnode7-012:0/16] 2024-12-08 23:01:03,498 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 23:01:19,646 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.3", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.3", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.3", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.3", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 23:01:19,646 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.3, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 23:01:19,649 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 23:01:42,099] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:01:42,318] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:01:42,511] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:01:42,675] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:01:43,092] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:01:43,438] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:01:43,232] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:01:43,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:01:43,564] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:01:43,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:01:44,396] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:01:44,420] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:01:44,484] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:01:44,296] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:01:44,993] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:01:45,063] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:30,256] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:30,556] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:30,986] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:32,366] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:32,314] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:32,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:32,708] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:33,063] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:33,182] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:33,173] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:33,628] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:33,886] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:33,939] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:34,321] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:34,464] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:02:34,879] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:18,724] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:20,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:21,125] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:21,373] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:21,828] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:22,238] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:22,842] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:23,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:23,557] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:24,147] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:24,393] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:24,639] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:24,508] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:25,794] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:26,763] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:03:28,481] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:06,788] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:08,780] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:10,432] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:10,599] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:10,937] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:11,053] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:11,285] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:11,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:11,563] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:13,249] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:13,558] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:13,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:14,440] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:15,114] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:16,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:04:19,702] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 23:05:19,591 (deepspeed_trainer:228) INFO: 41epoch:train:1-100batch: iter_time=2.447, loss_ctc=70.489, loss_att=55.195, acc=0.719, loss=59.800, grad_norm=6.566, loss_scale=1.000, learning_rate=6.324e-05, step_time=0.386 [cnode7-012:0/16] 2024-12-08 23:05:53,031 (deepspeed_trainer:228) INFO: 41epoch:train:101-200batch: iter_time=1.193e-04, loss_ctc=86.748, loss_att=61.761, acc=0.714, loss=69.257, grad_norm=8.177, loss_scale=1.000, learning_rate=6.324e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 23:06:26,248 (deepspeed_trainer:228) INFO: 41epoch:train:201-300batch: iter_time=1.208e-04, loss_ctc=72.603, loss_att=53.734, acc=0.715, loss=59.413, grad_norm=5.988, loss_scale=1.000, learning_rate=6.323e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 23:06:59,584 (deepspeed_trainer:228) INFO: 41epoch:train:301-400batch: iter_time=1.199e-04, loss_ctc=67.629, loss_att=51.562, acc=0.720, loss=56.367, grad_norm=5.935, loss_scale=1.000, learning_rate=6.323e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 23:07:32,737 (deepspeed_trainer:228) INFO: 41epoch:train:401-500batch: iter_time=1.232e-04, loss_ctc=69.061, loss_att=52.028, acc=0.725, loss=57.140, grad_norm=6.004, loss_scale=1.000, learning_rate=6.322e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 23:08:05,829 (deepspeed_trainer:228) INFO: 41epoch:train:501-600batch: iter_time=1.241e-04, loss_ctc=64.941, loss_att=47.092, acc=0.722, loss=52.463, grad_norm=6.427, loss_scale=1.000, learning_rate=6.322e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 23:08:39,124 (deepspeed_trainer:228) INFO: 41epoch:train:601-700batch: iter_time=1.215e-04, loss_ctc=61.889, loss_att=44.860, acc=0.732, loss=49.971, grad_norm=5.541, loss_scale=1.000, learning_rate=6.321e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 23:09:12,238 (deepspeed_trainer:228) INFO: 41epoch:train:701-800batch: iter_time=1.248e-04, loss_ctc=69.547, loss_att=51.697, acc=0.718, loss=57.042, grad_norm=5.557, loss_scale=1.000, learning_rate=6.321e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 23:09:45,454 (deepspeed_trainer:228) INFO: 41epoch:train:801-900batch: iter_time=1.226e-04, loss_ctc=74.843, loss_att=51.102, acc=0.714, loss=58.191, grad_norm=5.889, loss_scale=1.000, learning_rate=6.320e-05, step_time=0.332 [2024-12-08 23:10:18,664] [INFO] [logging.py:129:log_dist] [Rank 0] step=601000, skipped=0, lr=[np.float64(6.319286179204039e-05)], mom=[[0.9, 0.98]] [2024-12-08 23:10:18,665] [INFO] [timer.py:264:stop] epoch=0/micro_step=211000/global_step=211000, RunningAvgSamplesPerSec=49.19825382419495, CurrSamplesPerSec=53.39771002949723, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 23:10:18,668 (deepspeed_trainer:228) INFO: 41epoch:train:901-1000batch: iter_time=1.312e-04, loss_ctc=78.724, loss_att=58.787, acc=0.708, loss=64.734, grad_norm=6.879, loss_scale=1.000, learning_rate=6.320e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 23:10:51,836 (deepspeed_trainer:228) INFO: 41epoch:train:1001-1100batch: iter_time=1.296e-04, loss_ctc=62.267, loss_att=49.650, acc=0.736, loss=53.443, grad_norm=6.063, loss_scale=1.000, learning_rate=6.319e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 23:11:24,948 (deepspeed_trainer:228) INFO: 41epoch:train:1101-1200batch: iter_time=1.159e-04, loss_ctc=70.898, loss_att=52.926, acc=0.714, loss=58.318, grad_norm=6.345, loss_scale=1.000, learning_rate=6.318e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 23:11:57,771 (deepspeed_trainer:228) INFO: 41epoch:train:1201-1300batch: iter_time=1.157e-04, loss_ctc=62.833, loss_att=46.194, acc=0.724, loss=51.165, grad_norm=5.639, loss_scale=1.000, learning_rate=6.318e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 23:12:30,875 (deepspeed_trainer:228) INFO: 41epoch:train:1301-1400batch: iter_time=1.150e-04, loss_ctc=74.738, loss_att=54.303, acc=0.711, loss=60.422, grad_norm=6.333, loss_scale=1.000, learning_rate=6.317e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 23:13:03,820 (deepspeed_trainer:228) INFO: 41epoch:train:1401-1500batch: iter_time=1.156e-04, loss_ctc=66.671, loss_att=47.371, acc=0.731, loss=53.178, grad_norm=5.801, loss_scale=1.000, learning_rate=6.317e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 23:13:36,810 (deepspeed_trainer:228) INFO: 41epoch:train:1501-1600batch: iter_time=1.134e-04, loss_ctc=70.334, loss_att=47.835, acc=0.723, loss=54.609, grad_norm=5.886, loss_scale=1.000, learning_rate=6.316e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 23:14:09,718 (deepspeed_trainer:228) INFO: 41epoch:train:1601-1700batch: iter_time=1.158e-04, loss_ctc=56.682, loss_att=48.272, acc=0.710, loss=50.789, grad_norm=5.442, loss_scale=1.000, learning_rate=6.316e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 23:14:42,940 (deepspeed_trainer:228) INFO: 41epoch:train:1701-1800batch: iter_time=1.176e-04, loss_ctc=72.746, loss_att=56.392, acc=0.712, loss=61.282, grad_norm=6.883, loss_scale=1.000, learning_rate=6.315e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 23:15:12,636 (multiple_iter_factory:32) INFO: Building 1th iter-factory... [cnode7-012:0/16] 2024-12-08 23:15:39,619 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 23:15:55,611 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.7", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.7", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.7", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.7", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 23:15:55,611 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.7, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 23:15:55,613 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 23:16:23,409] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:16:23,519] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:16:23,574] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:16:23,619] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:16:24,079] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:16:24,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:16:24,527] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:16:24,870] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:16:25,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:16:25,400] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:16:25,782] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:16:25,546] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:16:26,631] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:16:26,721] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:16:26,765] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:16:26,808] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:11,006] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:11,236] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:11,738] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:11,963] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:12,663] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:13,222] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:13,258] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:15,809] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:15,857] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:13,700] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:16,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:16,573] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:16,688] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:17,122] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:17,748] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:17:17,828] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:01,055] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:01,301] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:01,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:01,786] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:01,935] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:02,264] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:04,642] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:02,621] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:02,971] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:06,632] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:06,689] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:06,727] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:07,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:07,744] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:07,875] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:08,186] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:49,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:49,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:49,664] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:49,723] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:50,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:50,705] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:50,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:51,103] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:55,014] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:56,074] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:56,301] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:57,256] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:57,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:58,322] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:18:58,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:19:01,460] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 23:19:37,181 (deepspeed_trainer:228) INFO: 41epoch:train:1801-1900batch: iter_time=2.550, loss_ctc=69.472, loss_att=49.501, acc=0.726, loss=55.448, grad_norm=5.835, loss_scale=1.000, learning_rate=6.315e-05, step_time=0.392 [2024-12-08 23:20:10,683] [INFO] [logging.py:129:log_dist] [Rank 0] step=602000, skipped=0, lr=[np.float64(6.314035429915395e-05)], mom=[[0.9, 0.98]] [2024-12-08 23:20:10,684] [INFO] [timer.py:264:stop] epoch=0/micro_step=212000/global_step=212000, RunningAvgSamplesPerSec=49.1968800951166, CurrSamplesPerSec=44.51753955306839, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 23:20:10,687 (deepspeed_trainer:228) INFO: 41epoch:train:1901-2000batch: iter_time=1.067e-04, loss_ctc=73.109, loss_att=57.292, acc=0.721, loss=62.047, grad_norm=6.261, loss_scale=1.000, learning_rate=6.314e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 23:20:44,215 (deepspeed_trainer:228) INFO: 41epoch:train:2001-2100batch: iter_time=1.095e-04, loss_ctc=81.883, loss_att=56.625, acc=0.722, loss=64.207, grad_norm=6.781, loss_scale=1.000, learning_rate=6.314e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 23:21:17,632 (deepspeed_trainer:228) INFO: 41epoch:train:2101-2200batch: iter_time=1.168e-04, loss_ctc=69.746, loss_att=56.199, acc=0.717, loss=60.267, grad_norm=6.266, loss_scale=1.000, learning_rate=6.313e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 23:21:50,779 (deepspeed_trainer:228) INFO: 41epoch:train:2201-2300batch: iter_time=1.099e-04, loss_ctc=68.812, loss_att=50.094, acc=0.721, loss=55.729, grad_norm=5.527, loss_scale=1.000, learning_rate=6.313e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 23:22:23,737 (deepspeed_trainer:228) INFO: 41epoch:train:2301-2400batch: iter_time=1.114e-04, loss_ctc=67.266, loss_att=49.096, acc=0.731, loss=54.547, grad_norm=5.573, loss_scale=1.000, learning_rate=6.312e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 23:22:56,468 (deepspeed_trainer:228) INFO: 41epoch:train:2401-2500batch: iter_time=1.092e-04, loss_ctc=64.892, loss_att=46.305, acc=0.729, loss=51.880, grad_norm=5.513, loss_scale=1.000, learning_rate=6.312e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 23:23:29,336 (deepspeed_trainer:228) INFO: 41epoch:train:2501-2600batch: iter_time=1.104e-04, loss_ctc=61.908, loss_att=43.301, acc=0.739, loss=48.852, grad_norm=5.699, loss_scale=1.000, learning_rate=6.311e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 23:24:02,141 (deepspeed_trainer:228) INFO: 41epoch:train:2601-2700batch: iter_time=1.074e-04, loss_ctc=74.323, loss_att=52.177, acc=0.715, loss=58.849, grad_norm=5.978, loss_scale=1.000, learning_rate=6.311e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 23:24:34,988 (deepspeed_trainer:228) INFO: 41epoch:train:2701-2800batch: iter_time=1.092e-04, loss_ctc=70.885, loss_att=51.470, acc=0.712, loss=57.312, grad_norm=6.387, loss_scale=1.000, learning_rate=6.310e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 23:25:07,846 (deepspeed_trainer:228) INFO: 41epoch:train:2801-2900batch: iter_time=1.080e-04, loss_ctc=73.482, loss_att=53.983, acc=0.724, loss=59.813, grad_norm=6.113, loss_scale=1.000, learning_rate=6.310e-05, step_time=0.328 [2024-12-08 23:25:40,771] [INFO] [logging.py:129:log_dist] [Rank 0] step=603000, skipped=0, lr=[np.float64(6.308797747591709e-05)], mom=[[0.9, 0.98]] [2024-12-08 23:25:40,771] [INFO] [timer.py:264:stop] epoch=0/micro_step=213000/global_step=213000, RunningAvgSamplesPerSec=49.20020897167418, CurrSamplesPerSec=51.1214023141815, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 23:25:40,774 (deepspeed_trainer:228) INFO: 41epoch:train:2901-3000batch: iter_time=1.077e-04, loss_ctc=62.161, loss_att=52.106, acc=0.725, loss=55.109, grad_norm=5.503, loss_scale=1.000, learning_rate=6.309e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 23:26:13,658 (deepspeed_trainer:228) INFO: 41epoch:train:3001-3100batch: iter_time=1.102e-04, loss_ctc=69.670, loss_att=50.786, acc=0.720, loss=56.471, grad_norm=6.558, loss_scale=1.000, learning_rate=6.309e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 23:26:46,470 (deepspeed_trainer:228) INFO: 41epoch:train:3101-3200batch: iter_time=1.066e-04, loss_ctc=66.896, loss_att=47.025, acc=0.732, loss=52.972, grad_norm=5.660, loss_scale=1.000, learning_rate=6.308e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 23:27:19,227 (deepspeed_trainer:228) INFO: 41epoch:train:3201-3300batch: iter_time=1.083e-04, loss_ctc=69.573, loss_att=51.389, acc=0.721, loss=56.867, grad_norm=6.444, loss_scale=1.000, learning_rate=6.307e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 23:27:51,973 (deepspeed_trainer:228) INFO: 41epoch:train:3301-3400batch: iter_time=1.104e-04, loss_ctc=72.905, loss_att=52.637, acc=0.722, loss=58.701, grad_norm=5.615, loss_scale=1.000, learning_rate=6.307e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 23:28:24,494 (deepspeed_trainer:228) INFO: 41epoch:train:3401-3500batch: iter_time=1.104e-04, loss_ctc=62.764, loss_att=45.327, acc=0.721, loss=50.553, grad_norm=5.568, loss_scale=1.000, learning_rate=6.306e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-08 23:28:56,842 (deepspeed_trainer:228) INFO: 41epoch:train:3501-3600batch: iter_time=1.059e-04, loss_ctc=60.880, loss_att=49.487, acc=0.715, loss=52.924, grad_norm=6.531, loss_scale=1.000, learning_rate=6.306e-05, step_time=0.323 [cnode7-012:0/16] 2024-12-08 23:29:29,541 (deepspeed_trainer:228) INFO: 41epoch:train:3601-3700batch: iter_time=1.089e-04, loss_ctc=66.564, loss_att=49.963, acc=0.726, loss=54.958, grad_norm=6.402, loss_scale=1.000, learning_rate=6.305e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 23:29:50,201 (multiple_iter_factory:32) INFO: Building 2th iter-factory... [cnode7-012:0/16] 2024-12-08 23:30:16,998 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 23:30:33,608 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.4", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.4", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.4", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.4", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 23:30:33,609 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.4, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 23:30:33,611 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 23:30:56,544] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:30:56,912] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:30:57,172] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:30:58,263] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:01,496] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:30:59,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:02,366] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:02,418] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:00,468] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:00,535] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:00,552] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:03,032] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:03,774] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:04,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:04,235] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:04,274] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:45,106] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:45,330] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:46,030] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:47,386] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:50,276] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:51,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:51,582] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:51,855] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:49,605] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:49,654] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:52,969] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:50,643] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:50,977] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:53,579] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:53,784] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:31:55,562] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:33,981] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:34,519] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:35,123] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:36,332] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:40,748] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:38,481] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:38,725] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:39,029] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:41,615] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:39,268] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:42,088] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:42,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:42,452] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:43,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:46,547] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:32:47,766] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:21,333] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:21,709] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:22,201] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:23,651] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:25,844] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:26,734] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:26,753] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:27,051] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:29,451] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:31,571] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:31,851] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:33,570] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:34,024] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:35,216] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:37,332] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:33:39,302] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 23:34:23,523 (deepspeed_trainer:228) INFO: 41epoch:train:3701-3800batch: iter_time=2.557, loss_ctc=71.629, loss_att=53.290, acc=0.727, loss=58.803, grad_norm=6.502, loss_scale=1.000, learning_rate=6.305e-05, step_time=0.382 [cnode7-012:0/16] 2024-12-08 23:34:57,704 (deepspeed_trainer:228) INFO: 41epoch:train:3801-3900batch: iter_time=1.090e-04, loss_ctc=73.594, loss_att=59.742, acc=0.718, loss=63.907, grad_norm=6.342, loss_scale=1.000, learning_rate=6.304e-05, step_time=0.341 [2024-12-08 23:35:31,390] [INFO] [logging.py:129:log_dist] [Rank 0] step=604000, skipped=0, lr=[np.float64(6.303573078125529e-05)], mom=[[0.9, 0.98]] [2024-12-08 23:35:31,391] [INFO] [timer.py:264:stop] epoch=0/micro_step=214000/global_step=214000, RunningAvgSamplesPerSec=49.200154921948545, CurrSamplesPerSec=50.758020666236725, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 23:35:31,394 (deepspeed_trainer:228) INFO: 41epoch:train:3901-4000batch: iter_time=1.114e-04, loss_ctc=77.957, loss_att=54.484, acc=0.726, loss=61.536, grad_norm=7.047, loss_scale=1.000, learning_rate=6.304e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 23:36:05,114 (deepspeed_trainer:228) INFO: 41epoch:train:4001-4100batch: iter_time=1.122e-04, loss_ctc=73.849, loss_att=58.722, acc=0.720, loss=63.276, grad_norm=6.310, loss_scale=1.000, learning_rate=6.303e-05, step_time=0.337 [cnode7-012:0/16] 2024-12-08 23:36:38,280 (deepspeed_trainer:228) INFO: 41epoch:train:4101-4200batch: iter_time=1.064e-04, loss_ctc=61.847, loss_att=44.860, acc=0.734, loss=49.959, grad_norm=5.423, loss_scale=1.000, learning_rate=6.303e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 23:37:11,498 (deepspeed_trainer:228) INFO: 41epoch:train:4201-4300batch: iter_time=1.062e-04, loss_ctc=69.961, loss_att=50.601, acc=0.730, loss=56.400, grad_norm=6.221, loss_scale=1.000, learning_rate=6.302e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 23:37:45,140 (deepspeed_trainer:228) INFO: 41epoch:train:4301-4400batch: iter_time=1.084e-04, loss_ctc=62.416, loss_att=42.589, acc=0.747, loss=48.541, grad_norm=5.468, loss_scale=1.000, learning_rate=6.302e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 23:38:18,325 (deepspeed_trainer:228) INFO: 41epoch:train:4401-4500batch: iter_time=1.068e-04, loss_ctc=63.640, loss_att=47.921, acc=0.732, loss=52.625, grad_norm=5.579, loss_scale=1.000, learning_rate=6.301e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 23:38:51,068 (deepspeed_trainer:228) INFO: 41epoch:train:4501-4600batch: iter_time=1.081e-04, loss_ctc=74.107, loss_att=50.602, acc=0.734, loss=57.668, grad_norm=5.763, loss_scale=1.000, learning_rate=6.301e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 23:39:24,173 (deepspeed_trainer:228) INFO: 41epoch:train:4601-4700batch: iter_time=1.086e-04, loss_ctc=70.885, loss_att=53.997, acc=0.710, loss=59.074, grad_norm=6.549, loss_scale=1.000, learning_rate=6.300e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 23:39:57,488 (deepspeed_trainer:228) INFO: 41epoch:train:4701-4800batch: iter_time=1.071e-04, loss_ctc=72.727, loss_att=54.200, acc=0.734, loss=59.767, grad_norm=5.488, loss_scale=1.000, learning_rate=6.300e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-08 23:40:30,547 (deepspeed_trainer:228) INFO: 41epoch:train:4801-4900batch: iter_time=1.063e-04, loss_ctc=65.197, loss_att=51.587, acc=0.730, loss=55.713, grad_norm=5.691, loss_scale=1.000, learning_rate=6.299e-05, step_time=0.330 [2024-12-08 23:41:03,499] [INFO] [logging.py:129:log_dist] [Rank 0] step=605000, skipped=0, lr=[np.float64(6.298361367722558e-05)], mom=[[0.9, 0.98]] [2024-12-08 23:41:03,500] [INFO] [timer.py:264:stop] epoch=0/micro_step=215000/global_step=215000, RunningAvgSamplesPerSec=49.202264169186385, CurrSamplesPerSec=52.79044779635238, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 23:41:03,502 (deepspeed_trainer:228) INFO: 41epoch:train:4901-5000batch: iter_time=1.075e-04, loss_ctc=66.700, loss_att=47.743, acc=0.728, loss=53.420, grad_norm=6.471, loss_scale=1.000, learning_rate=6.299e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 23:41:37,378 (deepspeed_trainer:228) INFO: 41epoch:train:5001-5100batch: iter_time=1.125e-04, loss_ctc=69.615, loss_att=52.093, acc=0.729, loss=57.345, grad_norm=5.880, loss_scale=1.000, learning_rate=6.298e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 23:42:10,218 (deepspeed_trainer:228) INFO: 41epoch:train:5101-5200batch: iter_time=1.080e-04, loss_ctc=65.872, loss_att=48.038, acc=0.729, loss=53.377, grad_norm=5.902, loss_scale=1.000, learning_rate=6.298e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 23:42:43,185 (deepspeed_trainer:228) INFO: 41epoch:train:5201-5300batch: iter_time=1.094e-04, loss_ctc=72.594, loss_att=53.255, acc=0.731, loss=59.048, grad_norm=5.677, loss_scale=1.000, learning_rate=6.297e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 23:43:15,771 (deepspeed_trainer:228) INFO: 41epoch:train:5301-5400batch: iter_time=1.072e-04, loss_ctc=60.953, loss_att=47.980, acc=0.719, loss=51.902, grad_norm=5.501, loss_scale=1.000, learning_rate=6.297e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-08 23:43:48,375 (deepspeed_trainer:228) INFO: 41epoch:train:5401-5500batch: iter_time=1.052e-04, loss_ctc=60.637, loss_att=48.554, acc=0.722, loss=52.143, grad_norm=6.376, loss_scale=1.000, learning_rate=6.296e-05, step_time=0.325 [cnode7-012:0/16] 2024-12-08 23:44:21,310 (deepspeed_trainer:228) INFO: 41epoch:train:5501-5600batch: iter_time=1.086e-04, loss_ctc=71.661, loss_att=51.830, acc=0.733, loss=57.796, grad_norm=5.862, loss_scale=1.000, learning_rate=6.295e-05, step_time=0.330 [cnode7-012:0/16] 2024-12-08 23:44:33,216 (multiple_iter_factory:32) INFO: Building 3th iter-factory... [cnode7-012:0/16] 2024-12-08 23:45:00,131 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 23:45:16,812 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.5", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.5", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.5", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.5", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 23:45:16,812 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.5, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 23:45:16,814 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-08 23:45:40,129] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:45:41,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:45:44,056] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:45:41,711] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:45:41,833] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:45:42,292] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:45:42,335] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:45:44,844] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:45:42,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:45:45,226] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:45:43,134] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:45:46,169] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:45:46,412] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:45:46,863] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:45:47,066] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:45:47,109] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:30,109] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:30,465] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:30,873] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:31,331] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:34,160] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:32,221] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:32,674] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:32,729] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:35,223] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:36,374] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:37,150] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:34,696] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:37,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:37,228] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:37,595] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:46:38,303] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:18,244] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:18,281] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:19,955] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:20,248] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:20,434] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:23,644] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:21,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:21,550] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:24,539] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:25,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:25,910] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:26,423] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:26,470] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:25,306] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:30,326] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:47:30,665] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:05,656] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:05,931] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:07,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:08,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:08,443] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:08,694] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:09,938] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:13,471] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:15,161] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:15,420] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:16,070] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:14,649] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:17,194] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:17,231] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:22,247] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-08 23:48:22,372] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-08 23:49:14,488 (deepspeed_trainer:228) INFO: 41epoch:train:5601-5700batch: iter_time=2.503, loss_ctc=71.797, loss_att=51.487, acc=0.728, loss=57.561, grad_norm=6.155, loss_scale=1.000, learning_rate=6.295e-05, step_time=0.428 [cnode7-012:0/16] 2024-12-08 23:49:48,306 (deepspeed_trainer:228) INFO: 41epoch:train:5701-5800batch: iter_time=1.084e-04, loss_ctc=83.139, loss_att=65.162, acc=0.704, loss=70.507, grad_norm=8.274, loss_scale=1.000, learning_rate=6.294e-05, step_time=0.338 [cnode7-012:0/16] 2024-12-08 23:50:22,070 (deepspeed_trainer:228) INFO: 41epoch:train:5801-5900batch: iter_time=1.095e-04, loss_ctc=65.201, loss_att=48.113, acc=0.731, loss=53.244, grad_norm=5.806, loss_scale=1.000, learning_rate=6.294e-05, step_time=0.337 [2024-12-08 23:50:55,679] [INFO] [logging.py:129:log_dist] [Rank 0] step=606000, skipped=0, lr=[np.float64(6.293162562899316e-05)], mom=[[0.9, 0.98]] [2024-12-08 23:50:55,680] [INFO] [timer.py:264:stop] epoch=0/micro_step=216000/global_step=216000, RunningAvgSamplesPerSec=49.19781167076403, CurrSamplesPerSec=52.45198565019328, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 23:50:55,683 (deepspeed_trainer:228) INFO: 41epoch:train:5901-6000batch: iter_time=1.108e-04, loss_ctc=71.385, loss_att=54.893, acc=0.721, loss=59.834, grad_norm=5.965, loss_scale=1.000, learning_rate=6.293e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-08 23:51:29,182 (deepspeed_trainer:228) INFO: 41epoch:train:6001-6100batch: iter_time=1.068e-04, loss_ctc=62.774, loss_att=46.648, acc=0.731, loss=51.523, grad_norm=5.517, loss_scale=1.000, learning_rate=6.293e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 23:52:02,772 (deepspeed_trainer:228) INFO: 41epoch:train:6101-6200batch: iter_time=1.114e-04, loss_ctc=69.954, loss_att=50.563, acc=0.724, loss=56.378, grad_norm=5.880, loss_scale=1.000, learning_rate=6.292e-05, step_time=0.335 [cnode7-012:0/16] 2024-12-08 23:52:36,244 (deepspeed_trainer:228) INFO: 41epoch:train:6201-6300batch: iter_time=1.111e-04, loss_ctc=60.878, loss_att=40.763, acc=0.750, loss=46.806, grad_norm=5.802, loss_scale=1.000, learning_rate=6.292e-05, step_time=0.334 [cnode7-012:0/16] 2024-12-08 23:53:09,516 (deepspeed_trainer:228) INFO: 41epoch:train:6301-6400batch: iter_time=1.077e-04, loss_ctc=70.212, loss_att=53.093, acc=0.720, loss=58.218, grad_norm=5.704, loss_scale=1.000, learning_rate=6.291e-05, step_time=0.332 [cnode7-012:0/16] 2024-12-08 23:53:42,618 (deepspeed_trainer:228) INFO: 41epoch:train:6401-6500batch: iter_time=1.144e-04, loss_ctc=70.903, loss_att=51.443, acc=0.721, loss=57.269, grad_norm=5.870, loss_scale=1.000, learning_rate=6.291e-05, step_time=0.331 [cnode7-012:0/16] 2024-12-08 23:54:15,579 (deepspeed_trainer:228) INFO: 41epoch:train:6501-6600batch: iter_time=1.145e-04, loss_ctc=70.821, loss_att=51.018, acc=0.715, loss=56.945, grad_norm=7.226, loss_scale=1.000, learning_rate=6.290e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 23:54:48,438 (deepspeed_trainer:228) INFO: 41epoch:train:6601-6700batch: iter_time=1.099e-04, loss_ctc=68.123, loss_att=54.449, acc=0.728, loss=58.548, grad_norm=5.375, loss_scale=1.000, learning_rate=6.290e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 23:55:21,266 (deepspeed_trainer:228) INFO: 41epoch:train:6701-6800batch: iter_time=1.122e-04, loss_ctc=68.032, loss_att=52.303, acc=0.715, loss=57.017, grad_norm=6.023, loss_scale=1.000, learning_rate=6.289e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 23:55:54,012 (deepspeed_trainer:228) INFO: 41epoch:train:6801-6900batch: iter_time=1.104e-04, loss_ctc=64.317, loss_att=46.008, acc=0.732, loss=51.531, grad_norm=5.625, loss_scale=1.000, learning_rate=6.289e-05, step_time=0.327 [2024-12-08 23:56:26,947] [INFO] [logging.py:129:log_dist] [Rank 0] step=607000, skipped=0, lr=[np.float64(6.28797661048084e-05)], mom=[[0.9, 0.98]] [2024-12-08 23:56:26,948] [INFO] [timer.py:264:stop] epoch=0/micro_step=217000/global_step=217000, RunningAvgSamplesPerSec=49.2001596041015, CurrSamplesPerSec=51.50559463634015, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-08 23:56:26,950 (deepspeed_trainer:228) INFO: 41epoch:train:6901-7000batch: iter_time=1.059e-04, loss_ctc=68.606, loss_att=51.061, acc=0.719, loss=56.332, grad_norm=6.324, loss_scale=1.000, learning_rate=6.288e-05, step_time=0.329 [cnode7-012:0/16] 2024-12-08 23:56:59,744 (deepspeed_trainer:228) INFO: 41epoch:train:7001-7100batch: iter_time=1.087e-04, loss_ctc=65.737, loss_att=48.205, acc=0.730, loss=53.461, grad_norm=5.521, loss_scale=1.000, learning_rate=6.288e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 23:57:32,433 (deepspeed_trainer:228) INFO: 41epoch:train:7101-7200batch: iter_time=1.091e-04, loss_ctc=71.200, loss_att=48.212, acc=0.732, loss=55.136, grad_norm=6.196, loss_scale=1.000, learning_rate=6.287e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-08 23:58:04,854 (deepspeed_trainer:228) INFO: 41epoch:train:7201-7300batch: iter_time=1.079e-04, loss_ctc=55.853, loss_att=44.658, acc=0.719, loss=48.012, grad_norm=5.278, loss_scale=1.000, learning_rate=6.287e-05, step_time=0.324 [cnode7-012:0/16] 2024-12-08 23:58:37,579 (deepspeed_trainer:228) INFO: 41epoch:train:7301-7400batch: iter_time=1.119e-04, loss_ctc=67.869, loss_att=52.971, acc=0.715, loss=57.433, grad_norm=6.160, loss_scale=1.000, learning_rate=6.286e-05, step_time=0.327 [cnode7-012:0/16] 2024-12-08 23:59:10,438 (deepspeed_trainer:228) INFO: 41epoch:train:7401-7500batch: iter_time=1.096e-04, loss_ctc=72.041, loss_att=53.169, acc=0.727, loss=58.821, grad_norm=5.954, loss_scale=1.000, learning_rate=6.286e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-08 23:59:14,027 (multiple_iter_factory:32) INFO: Building 4th iter-factory... [cnode7-012:0/16] 2024-12-08 23:59:40,768 (s2t:444) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') [cnode7-012:0/16] 2024-12-08 23:59:58,056 (abs_task:1807) INFO: [train] dataset: ESPnetDataset( speech: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/wav.scp/split.0", "type": "kaldi_ark"} text_prev: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.prev/split.0", "type": "text"} text_ctc: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text.ctc/split.0", "type": "text"} text: {"path": "exp_owsm/s2t_stats_raw_bpe50000/splits8/text/split.0", "type": "text"} preprocess: ) [cnode7-012:0/16] 2024-12-08 23:59:58,057 (abs_task:1808) INFO: [train] Batch sampler: SortedBatchSampler(N-batch=28521, batch_size=256, shape_file=exp_owsm/s2t_stats_raw_bpe50000/splits8/speech_shape/split.0, sort_in_batch=descending, sort_batch=descending) [cnode7-012:0/16] 2024-12-08 23:59:58,059 (abs_task:1809) INFO: [train] mini-batch sizes summary: N-batch=28521, mean=256.0, min=256, max=257 [2024-12-09 00:00:20,792] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:00:21,045] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:00:22,065] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:00:22,755] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:00:23,032] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:00:23,081] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:00:25,626] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:00:23,452] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:00:23,463] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:00:26,001] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:00:27,079] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:00:27,896] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:00:27,955] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:00:27,979] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:00:28,202] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:00:28,647] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:10,102] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:10,437] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:10,780] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:11,027] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:14,014] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:11,764] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:14,530] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:12,517] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:12,797] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:16,793] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:17,357] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:17,754] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:17,984] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:18,026] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:18,297] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:16,199] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:01:59,996] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:01,022] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:01,170] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:01,400] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:01,857] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:01,956] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:04,553] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:05,010] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:02,722] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:07,204] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:07,614] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:07,860] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:08,333] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:08,914] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:09,142] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:06,814] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:50,239] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:51,011] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:51,094] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:51,387] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:54,059] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:51,536] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:51,679] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:51,826] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:55,607] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:54,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:58,446] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:58,479] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:59,692] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:02:59,899] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:03:00,352] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-12-09 00:03:00,355] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) [cnode7-012:0/16] 2024-12-09 00:04:03,058 (deepspeed_trainer:228) INFO: 41epoch:train:7501-7600batch: iter_time=2.568, loss_ctc=69.911, loss_att=55.862, acc=0.727, loss=60.074, grad_norm=6.110, loss_scale=1.000, learning_rate=6.285e-05, step_time=0.358 [cnode7-012:0/16] 2024-12-09 00:04:37,129 (deepspeed_trainer:228) INFO: 41epoch:train:7601-7700batch: iter_time=1.096e-04, loss_ctc=82.234, loss_att=57.936, acc=0.730, loss=65.207, grad_norm=7.756, loss_scale=1.000, learning_rate=6.285e-05, step_time=0.340 [cnode7-012:0/16] 2024-12-09 00:05:10,769 (deepspeed_trainer:228) INFO: 41epoch:train:7701-7800batch: iter_time=1.079e-04, loss_ctc=70.222, loss_att=52.295, acc=0.726, loss=57.673, grad_norm=5.892, loss_scale=1.000, learning_rate=6.284e-05, step_time=0.336 [cnode7-012:0/16] 2024-12-09 00:05:44,275 (deepspeed_trainer:228) INFO: 41epoch:train:7801-7900batch: iter_time=1.136e-04, loss_ctc=66.196, loss_att=50.925, acc=0.725, loss=55.496, grad_norm=5.462, loss_scale=1.000, learning_rate=6.284e-05, step_time=0.335 [2024-12-09 00:06:17,621] [INFO] [logging.py:129:log_dist] [Rank 0] step=608000, skipped=0, lr=[np.float64(6.282803457598398e-05)], mom=[[0.9, 0.98]] [2024-12-09 00:06:17,622] [INFO] [timer.py:264:stop] epoch=0/micro_step=218000/global_step=218000, RunningAvgSamplesPerSec=49.20092467904893, CurrSamplesPerSec=52.61714494877622, MemAllocated=2.04GB, MaxMemAllocated=35.33GB [cnode7-012:0/16] 2024-12-09 00:06:17,625 (deepspeed_trainer:228) INFO: 41epoch:train:7901-8000batch: iter_time=1.142e-04, loss_ctc=68.297, loss_att=50.513, acc=0.741, loss=55.868, grad_norm=5.427, loss_scale=1.000, learning_rate=6.283e-05, step_time=0.333 [cnode7-012:0/16] 2024-12-09 00:06:50,456 (deepspeed_trainer:228) INFO: 41epoch:train:8001-8100batch: iter_time=1.142e-04, loss_ctc=63.838, loss_att=46.241, acc=0.731, loss=51.511, grad_norm=5.438, loss_scale=1.000, learning_rate=6.283e-05, step_time=0.328 [cnode7-012:0/16] 2024-12-09 00:07:23,084 (deepspeed_trainer:228) INFO: 41epoch:train:8101-8200batch: iter_time=1.136e-04, loss_ctc=61.661, loss_att=44.083, acc=0.741, loss=49.328, grad_norm=5.017, loss_scale=1.000, learning_rate=6.282e-05, step_time=0.326 [cnode7-012:0/16] 2024-12-09 00:07:55,804 (deepspeed_trainer:228) INFO: 41epoch:train:8201-8300batch: iter_time=1.122e-04, loss_ctc=68.939, loss_att=50.059, acc=0.734, loss=55.725, grad_norm=5.582, loss_scale=1.000, learning_rate=6.282e-05, step_time=0.327 Process SpawnProcess-6: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 132, in run cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 204, in train_one_epoch loss, stats, weight = model(**batch) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn ret_val = func(*args, **kwargs) File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1899, in forward loss = self.module(*inputs, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py", line 225, in forward loss_att, acc_att, cer_att, wer_att = self._calc_att_loss( File "/mnt/home/williamchen/espnet/espnet2/s2t/espnet_model.py", line 391, in _calc_att_loss decoder_out, _ = self.decoder( File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet2/asr/decoder/transformer_decoder.py", line 148, in forward x = self.embed(tgt) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/container.py", line 219, in forward input = module(input) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/espnet/espnet/nets/pytorch_backend/transformer/embedding.py", line 92, in forward return self.dropout(x) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/modules/dropout.py", line 59, in forward return F.dropout(input, self.p, self.training, self.inplace) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 1295, in dropout return _VF.dropout_(input, p, training) if inplace else _VF.dropout(input, p, training) torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.73 GiB. GPU 5 has a total capacity of 79.11 GiB of which 1.31 GiB is free. Including non-PyTorch memory, this process has 77.79 GiB memory in use. Of the allocated memory 70.35 GiB is allocated by PyTorch, and 1.37 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) cnode7-012:1065357:1074373 [7] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-012:1065357:1066260 [7] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-012:1065351:1074372 [1] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-012:1065351:1066262 [1] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-012:1065356:1074374 [6] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-012:1065356:1066258 [6] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-012:1065352:1074370 [2] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-012:1065352:1066259 [2] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-012:1065350:1074384 [0] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-012:1065350:1066268 [0] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-012:1065354:1074371 [4] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-012:1065354:1066270 [4] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-012:1065353:1074379 [3] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-012:1065353:1066269 [3] NCCL INFO [Service thread] Connection closed by localRank 5 W1209 00:08:06.918000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 1065350 via signal SIGTERM W1209 00:08:06.919000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 1065351 via signal SIGTERM W1209 00:08:06.920000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 1065352 via signal SIGTERM W1209 00:08:06.920000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 1065353 via signal SIGTERM W1209 00:08:06.921000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 1065354 via signal SIGTERM W1209 00:08:06.921000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 1065356 via signal SIGTERM W1209 00:08:06.922000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 1065357 via signal SIGTERM Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py", line 23, in main() File "/mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py", line 19, in main S2TTask.main(cmd=cmd) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1273, in main while not ProcessContext(processes, error_files).join(): File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 178, in join raise ProcessExitedException( torch.multiprocessing.spawn.ProcessExitedException: process 5 terminated with exit code 1 srun: error: cnode7-012: task 0: Exited with exit code 1 [rank9]:[E1209 00:17:55.872245043 ProcessGroupNCCL.cpp:607] [Rank 9] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. [rank13]:[E1209 00:17:55.873145573 ProcessGroupNCCL.cpp:607] [Rank 13] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. [rank10]:[E1209 00:17:55.873994206 ProcessGroupNCCL.cpp:607] [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. [rank12]:[E1209 00:17:55.874498857 ProcessGroupNCCL.cpp:607] [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. [rank14]:[E1209 00:17:55.874750537 ProcessGroupNCCL.cpp:607] [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. [rank11]:[E1209 00:17:55.875630206 ProcessGroupNCCL.cpp:607] [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. [rank14]:[E1209 00:17:55.878805192 ProcessGroupNCCL.cpp:607] [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600008 milliseconds before timing out. [rank8]:[E1209 00:17:55.880015597 ProcessGroupNCCL.cpp:607] [Rank 8] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. [rank15]:[E1209 00:17:55.880935170 ProcessGroupNCCL.cpp:607] [Rank 15] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. [rank14]:[E1209 00:17:55.880688249 ProcessGroupNCCL.cpp:1664] [PG 0 (default_pg) Rank 14] Exception (either an error or timeout) detected by watchdog at work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank14]:[E1209 00:17:55.881001098 ProcessGroupNCCL.cpp:1709] [PG 0 (default_pg) Rank 14] Timeout at NCCL work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank14]:[E1209 00:17:55.881005700 ProcessGroupNCCL.cpp:621] [Rank 14] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. [rank15]:[E1209 00:17:55.883803930 ProcessGroupNCCL.cpp:670] [Rank 15] Work WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) timed out in blocking wait (TORCH_NCCL_BLOCKING_WAIT=1). [rank8]:[E1209 00:17:55.883805051 ProcessGroupNCCL.cpp:670] [Rank 8] Work WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) timed out in blocking wait (TORCH_NCCL_BLOCKING_WAIT=1). [rank13]:[E1209 00:17:55.883805266 ProcessGroupNCCL.cpp:670] [Rank 13] Work WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) timed out in blocking wait (TORCH_NCCL_BLOCKING_WAIT=1). [rank14]:[E1209 00:17:55.883811257 ProcessGroupNCCL.cpp:670] [Rank 14] Work WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) timed out in blocking wait (TORCH_NCCL_BLOCKING_WAIT=1). [rank12]:[E1209 00:17:55.883816144 ProcessGroupNCCL.cpp:670] [Rank 12] Work WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) timed out in blocking wait (TORCH_NCCL_BLOCKING_WAIT=1). [rank9]:[E1209 00:17:55.883819102 ProcessGroupNCCL.cpp:670] [Rank 9] Work WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) timed out in blocking wait (TORCH_NCCL_BLOCKING_WAIT=1). [rank10]:[E1209 00:17:55.883833722 ProcessGroupNCCL.cpp:670] [Rank 10] Work WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) timed out in blocking wait (TORCH_NCCL_BLOCKING_WAIT=1). [rank11]:[E1209 00:17:55.883838790 ProcessGroupNCCL.cpp:670] [Rank 11] Work WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) timed out in blocking wait (TORCH_NCCL_BLOCKING_WAIT=1). cnode7-013:595413:605207 [3] NCCL INFO [Service thread] Connection closed by localRank 0 cnode7-013:595417:605209 [7] NCCL INFO [Service thread] Connection closed by localRank 0 cnode7-013:595411:605214 [1] NCCL INFO [Service thread] Connection closed by localRank 0 cnode7-013:595415:605213 [5] NCCL INFO [Service thread] Connection closed by localRank 0 cnode7-013:595412:605206 [2] NCCL INFO [Service thread] Connection closed by localRank 0 cnode7-013:595416:605208 [6] NCCL INFO [Service thread] Connection closed by localRank 0 cnode7-013:595416:605208 [6] NCCL INFO [Service thread] Connection closed by localRank 2 cnode7-013:595414:605216 [4] NCCL INFO [Service thread] Connection closed by localRank 0 cnode7-013:595414:605216 [4] NCCL INFO [Service thread] Connection closed by localRank 2 cnode7-013:595411:605214 [1] NCCL INFO [Service thread] Connection closed by localRank 2 cnode7-013:595417:605209 [7] NCCL INFO [Service thread] Connection closed by localRank 2 cnode7-013:595417:605209 [7] NCCL INFO [Service thread] Connection closed by localRank 1 cnode7-013:595415:605213 [5] NCCL INFO [Service thread] Connection closed by localRank 2 cnode7-013:595415:605213 [5] NCCL INFO [Service thread] Connection closed by localRank 1 cnode7-013:595415:605213 [5] NCCL INFO [Service thread] Connection closed by localRank 7 cnode7-013:595416:605208 [6] NCCL INFO [Service thread] Connection closed by localRank 1 cnode7-013:595416:605208 [6] NCCL INFO [Service thread] Connection closed by localRank 7 cnode7-013:595414:605216 [4] NCCL INFO [Service thread] Connection closed by localRank 1 cnode7-013:595414:605216 [4] NCCL INFO [Service thread] Connection closed by localRank 7 cnode7-013:595414:605216 [4] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-013:595414:605216 [4] NCCL INFO [Service thread] Connection closed by localRank 6 cnode7-013:595410:605205 [0] NCCL INFO [Service thread] Connection closed by localRank 0 cnode7-013:595410:605205 [0] NCCL INFO [Service thread] Connection closed by localRank 2 cnode7-013:595410:605205 [0] NCCL INFO [Service thread] Connection closed by localRank 1 cnode7-013:595410:605205 [0] NCCL INFO [Service thread] Connection closed by localRank 7 cnode7-013:595410:605205 [0] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-013:595410:605205 [0] NCCL INFO [Service thread] Connection closed by localRank 6 cnode7-013:595410:605205 [0] NCCL INFO [Service thread] Connection closed by localRank 4 cnode7-013:595412:605206 [2] NCCL INFO [Service thread] Connection closed by localRank 2 cnode7-013:595412:605206 [2] NCCL INFO [Service thread] Connection closed by localRank 1 cnode7-013:595412:605206 [2] NCCL INFO [Service thread] Connection closed by localRank 7 cnode7-013:595412:605206 [2] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-013:595412:605206 [2] NCCL INFO [Service thread] Connection closed by localRank 6 cnode7-013:595412:605206 [2] NCCL INFO [Service thread] Connection closed by localRank 4 cnode7-013:595413:605207 [3] NCCL INFO [Service thread] Connection closed by localRank 2 cnode7-013:595413:605207 [3] NCCL INFO [Service thread] Connection closed by localRank 1 cnode7-013:595413:605207 [3] NCCL INFO [Service thread] Connection closed by localRank 7 cnode7-013:595413:605207 [3] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-013:595413:605207 [3] NCCL INFO [Service thread] Connection closed by localRank 6 cnode7-013:595413:605207 [3] NCCL INFO [Service thread] Connection closed by localRank 4 cnode7-013:595413:605207 [3] NCCL INFO [Service thread] Connection closed by localRank 3 cnode7-013:595412:605206 [2] NCCL INFO [Service thread] Connection closed by localRank 3 cnode7-013:595417:605209 [7] NCCL INFO [Service thread] Connection closed by localRank 7 cnode7-013:595417:605209 [7] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-013:595417:605209 [7] NCCL INFO [Service thread] Connection closed by localRank 6 cnode7-013:595417:605209 [7] NCCL INFO [Service thread] Connection closed by localRank 4 cnode7-013:595417:605209 [7] NCCL INFO [Service thread] Connection closed by localRank 3 cnode7-013:595411:605214 [1] NCCL INFO [Service thread] Connection closed by localRank 1 cnode7-013:595411:605214 [1] NCCL INFO [Service thread] Connection closed by localRank 7 cnode7-013:595411:605214 [1] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-013:595411:605214 [1] NCCL INFO [Service thread] Connection closed by localRank 6 cnode7-013:595411:605214 [1] NCCL INFO [Service thread] Connection closed by localRank 4 cnode7-013:595411:605214 [1] NCCL INFO [Service thread] Connection closed by localRank 3 cnode7-013:595414:605216 [4] NCCL INFO [Service thread] Connection closed by localRank 4 cnode7-013:595414:605216 [4] NCCL INFO [Service thread] Connection closed by localRank 3 cnode7-013:595410:605205 [0] NCCL INFO [Service thread] Connection closed by localRank 3 cnode7-013:595416:605208 [6] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-013:595416:605208 [6] NCCL INFO [Service thread] Connection closed by localRank 6 cnode7-013:595416:605208 [6] NCCL INFO [Service thread] Connection closed by localRank 4 cnode7-013:595416:605208 [6] NCCL INFO [Service thread] Connection closed by localRank 3 cnode7-013:595415:605213 [5] NCCL INFO [Service thread] Connection closed by localRank 5 cnode7-013:595415:605213 [5] NCCL INFO [Service thread] Connection closed by localRank 6 cnode7-013:595415:605213 [5] NCCL INFO [Service thread] Connection closed by localRank 4 cnode7-013:595415:605213 [5] NCCL INFO [Service thread] Connection closed by localRank 3 cnode7-013:595413:595413 [3] NCCL INFO comm 0x5555bea42f00 rank 11 nranks 16 cudaDev 3 busId 61000 - Abort COMPLETE [rank11]:[E1209 00:17:56.342807589 ProcessGroupNCCL.cpp:621] [Rank 11] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. [rank11]:[E1209 00:17:56.342825835 ProcessGroupNCCL.cpp:627] [Rank 11] To avoid data inconsistency, we are taking the entire process down. cnode7-013:595416:595416 [6] NCCL INFO comm 0x5555d51cf510 rank 14 nranks 16 cudaDev 6 busId d1000 - Abort COMPLETE [rank14]:[E1209 00:17:56.347245747 ProcessGroupNCCL.cpp:621] [Rank 14] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. [rank14]:[E1209 00:17:56.347262102 ProcessGroupNCCL.cpp:627] [Rank 14] To avoid data inconsistency, we are taking the entire process down. [rank11]:[E1209 00:17:56.352014307 ProcessGroupNCCL.cpp:1664] [PG 0 (default_pg) Rank 11] Exception (either an error or timeout) detected by watchdog at work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank11]:[E1209 00:17:56.352029104 ProcessGroupNCCL.cpp:1709] [PG 0 (default_pg) Rank 11] Timeout at NCCL work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank11]:[E1209 00:17:56.352033503 ProcessGroupNCCL.cpp:621] [Rank 11] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. cnode7-013:595414:595414 [4] NCCL INFO comm 0x5555c11d8790 rank 12 nranks 16 cudaDev 4 busId 9d000 - Abort COMPLETE [rank12]:[E1209 00:17:56.369640063 ProcessGroupNCCL.cpp:621] [Rank 12] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. [rank12]:[E1209 00:17:56.369655194 ProcessGroupNCCL.cpp:627] [Rank 12] To avoid data inconsistency, we are taking the entire process down. [rank12]:[E1209 00:17:56.371478714 ProcessGroupNCCL.cpp:1664] [PG 0 (default_pg) Rank 12] Exception (either an error or timeout) detected by watchdog at work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank12]:[E1209 00:17:56.371492509 ProcessGroupNCCL.cpp:1709] [PG 0 (default_pg) Rank 12] Timeout at NCCL work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank12]:[E1209 00:17:56.371498634 ProcessGroupNCCL.cpp:621] [Rank 12] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. cnode7-013:595411:595411 [1] NCCL INFO comm 0x5555c11d01f0 rank 9 nranks 16 cudaDev 1 busId 43000 - Abort COMPLETE [rank9]:[E1209 00:17:56.371495173 ProcessGroupNCCL.cpp:621] [Rank 9] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. [rank9]:[E1209 00:17:56.371508731 ProcessGroupNCCL.cpp:627] [Rank 9] To avoid data inconsistency, we are taking the entire process down. [rank9]:[E1209 00:17:56.373364957 ProcessGroupNCCL.cpp:1664] [PG 0 (default_pg) Rank 9] Exception (either an error or timeout) detected by watchdog at work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank9]:[E1209 00:17:56.373381182 ProcessGroupNCCL.cpp:1709] [PG 0 (default_pg) Rank 9] Timeout at NCCL work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank9]:[E1209 00:17:56.373385394 ProcessGroupNCCL.cpp:621] [Rank 9] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. cnode7-013:595417:595417 [7] NCCL INFO comm 0x5555c11a5720 rank 15 nranks 16 cudaDev 7 busId df000 - Abort COMPLETE [rank15]:[E1209 00:17:56.413920236 ProcessGroupNCCL.cpp:621] [Rank 15] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. [rank15]:[E1209 00:17:56.413936231 ProcessGroupNCCL.cpp:627] [Rank 15] To avoid data inconsistency, we are taking the entire process down. [rank15]:[E1209 00:17:56.415886100 ProcessGroupNCCL.cpp:1664] [PG 0 (default_pg) Rank 15] Exception (either an error or timeout) detected by watchdog at work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank15]:[E1209 00:17:56.415900264 ProcessGroupNCCL.cpp:1709] [PG 0 (default_pg) Rank 15] Timeout at NCCL work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank15]:[E1209 00:17:56.415905640 ProcessGroupNCCL.cpp:621] [Rank 15] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. cnode7-013:595412:595412 [2] NCCL INFO comm 0x5555c11d2200 rank 10 nranks 16 cudaDev 2 busId 52000 - Abort COMPLETE [rank10]:[E1209 00:17:56.428534449 ProcessGroupNCCL.cpp:621] [Rank 10] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. [rank10]:[E1209 00:17:56.428556651 ProcessGroupNCCL.cpp:627] [Rank 10] To avoid data inconsistency, we are taking the entire process down. [rank10]:[E1209 00:17:56.430606743 ProcessGroupNCCL.cpp:1664] [PG 0 (default_pg) Rank 10] Exception (either an error or timeout) detected by watchdog at work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank10]:[E1209 00:17:56.430621718 ProcessGroupNCCL.cpp:1709] [PG 0 (default_pg) Rank 10] Timeout at NCCL work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank10]:[E1209 00:17:56.430626473 ProcessGroupNCCL.cpp:621] [Rank 10] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. cnode7-013:595410:595410 [0] NCCL INFO comm 0x5555b71d18d0 rank 8 nranks 16 cudaDev 0 busId 1b000 - Abort COMPLETE [rank8]:[E1209 00:17:56.555751863 ProcessGroupNCCL.cpp:621] [Rank 8] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. [rank8]:[E1209 00:17:56.555777982 ProcessGroupNCCL.cpp:627] [Rank 8] To avoid data inconsistency, we are taking the entire process down. [rank8]:[E1209 00:17:56.557880621 ProcessGroupNCCL.cpp:1664] [PG 0 (default_pg) Rank 8] Exception (either an error or timeout) detected by watchdog at work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank8]:[E1209 00:17:56.557897854 ProcessGroupNCCL.cpp:1709] [PG 0 (default_pg) Rank 8] Timeout at NCCL work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank8]:[E1209 00:17:56.557903820 ProcessGroupNCCL.cpp:621] [Rank 8] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. cnode7-013:595415:595415 [5] NCCL INFO comm 0x5555c11d89d0 rank 13 nranks 16 cudaDev 5 busId c3000 - Abort COMPLETE [rank13]:[E1209 00:17:56.592721253 ProcessGroupNCCL.cpp:621] [Rank 13] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. [rank13]:[E1209 00:17:56.592740793 ProcessGroupNCCL.cpp:627] [Rank 13] To avoid data inconsistency, we are taking the entire process down. [rank13]:[E1209 00:17:56.594579858 ProcessGroupNCCL.cpp:1664] [PG 0 (default_pg) Rank 13] Exception (either an error or timeout) detected by watchdog at work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank13]:[E1209 00:17:56.594594023 ProcessGroupNCCL.cpp:1709] [PG 0 (default_pg) Rank 13] Timeout at NCCL work: 1091718, last enqueued NCCL work: 1091718, last completed NCCL work: 1091717. [rank13]:[E1209 00:17:56.594598513 ProcessGroupNCCL.cpp:621] [Rank 13] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. Process SpawnProcess-4: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 132, in run cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 208, in train_one_epoch stats, weight = recursive_average(stats, weight, True) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 53, in recursive_average obj = recursive_sum(obj, weight, distributed) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in recursive_sum return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 22, in recursive_sum torch.distributed.all_gather(lst, obj) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 79, in wrapper return func(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 3113, in all_gather work.wait() torch.distributed.DistBackendError: [Rank 11] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. Process SpawnProcess-7: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 132, in run cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 208, in train_one_epoch stats, weight = recursive_average(stats, weight, True) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 53, in recursive_average obj = recursive_sum(obj, weight, distributed) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in recursive_sum return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 22, in recursive_sum torch.distributed.all_gather(lst, obj) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 79, in wrapper return func(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 3113, in all_gather work.wait() torch.distributed.DistBackendError: [Rank 14] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600004 milliseconds before timing out. Process SpawnProcess-3: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 132, in run cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 208, in train_one_epoch stats, weight = recursive_average(stats, weight, True) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 53, in recursive_average obj = recursive_sum(obj, weight, distributed) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in recursive_sum return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 22, in recursive_sum torch.distributed.all_gather(lst, obj) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 79, in wrapper return func(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 3113, in all_gather work.wait() torch.distributed.DistBackendError: [Rank 10] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600000 milliseconds before timing out. Process SpawnProcess-5: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 132, in run cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 208, in train_one_epoch stats, weight = recursive_average(stats, weight, True) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 53, in recursive_average obj = recursive_sum(obj, weight, distributed) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in recursive_sum return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 22, in recursive_sum torch.distributed.all_gather(lst, obj) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 79, in wrapper return func(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 3113, in all_gather work.wait() torch.distributed.DistBackendError: [Rank 12] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. Process SpawnProcess-2: Process SpawnProcess-8: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 132, in run cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 208, in train_one_epoch stats, weight = recursive_average(stats, weight, True) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 53, in recursive_average obj = recursive_sum(obj, weight, distributed) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in recursive_sum return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 22, in recursive_sum torch.distributed.all_gather(lst, obj) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 79, in wrapper return func(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 3113, in all_gather work.wait() torch.distributed.DistBackendError: [Rank 9] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600001 milliseconds before timing out. Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 132, in run cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 208, in train_one_epoch stats, weight = recursive_average(stats, weight, True) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 53, in recursive_average obj = recursive_sum(obj, weight, distributed) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in recursive_sum return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 22, in recursive_sum torch.distributed.all_gather(lst, obj) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 79, in wrapper return func(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 3113, in all_gather work.wait() torch.distributed.DistBackendError: [Rank 15] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600002 milliseconds before timing out. Process SpawnProcess-1: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 132, in run cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 208, in train_one_epoch stats, weight = recursive_average(stats, weight, True) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 53, in recursive_average obj = recursive_sum(obj, weight, distributed) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in recursive_sum return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 22, in recursive_sum torch.distributed.all_gather(lst, obj) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 79, in wrapper return func(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 3113, in all_gather work.wait() torch.distributed.DistBackendError: [Rank 8] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600005 milliseconds before timing out. Process SpawnProcess-6: Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1568, in main_worker cls.trainer.run( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 132, in run cls.train_one_epoch( File "/mnt/home/williamchen/espnet/espnet2/train/deepspeed_trainer.py", line 208, in train_one_epoch stats, weight = recursive_average(stats, weight, True) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 53, in recursive_average obj = recursive_sum(obj, weight, distributed) File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in recursive_sum return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 14, in return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} File "/mnt/home/williamchen/espnet/espnet2/torch_utils/recursive_op.py", line 22, in recursive_sum torch.distributed.all_gather(lst, obj) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 79, in wrapper return func(*args, **kwargs) File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 3113, in all_gather work.wait() torch.distributed.DistBackendError: [Rank 13] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1091718, OpType=ALLGATHER, NumelIn=1, NumelOut=16, Timeout(ms)=600000) ran for 600003 milliseconds before timing out. W1209 00:18:06.364000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 595410 via signal SIGTERM W1209 00:18:06.365000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 595411 via signal SIGTERM W1209 00:18:06.365000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 595414 via signal SIGTERM W1209 00:18:06.365000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 595415 via signal SIGTERM W1209 00:18:06.365000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 595416 via signal SIGTERM W1209 00:18:06.365000 23456244418368 torch/multiprocessing/spawn.py:146] Terminating process 595417 via signal SIGTERM Traceback (most recent call last): File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/mnt/home/williamchen/espnet/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py", line 23, in main() File "/mnt/home/williamchen/espnet/espnet2/bin/s2t_train.py", line 19, in main S2TTask.main(cmd=cmd) File "/mnt/home/williamchen/espnet/espnet2/tasks/abs_task.py", line 1273, in main while not ProcessContext(processes, error_files).join(): File "/mnt/home/williamchen/.local/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 178, in join raise ProcessExitedException( torch.multiprocessing.spawn.ProcessExitedException: process 2 terminated with exit code 1 srun: error: cnode7-013: task 1: Exited with exit code 1 # Accounting: begin_time=1733570033 # Accounting: end_time=1733674692 # Accounting: time=104659 threads=1 # Finished at Mon Dec 9 00:18:12 CST 2024 with status 1