diff --git "a/diff_s_1000kx1024_qknorm/log.txt" "b/diff_s_1000kx1024_qknorm/log.txt" new file mode 100644--- /dev/null +++ "b/diff_s_1000kx1024_qknorm/log.txt" @@ -0,0 +1,10291 @@ +[2025-02-19 15:23:26] Model: DistributedDataParallel( + (module): DiT( + (x_embedder): PatchEmbed( + (proj): Conv2d(4, 384, kernel_size=(2, 2), stride=(2, 2)) + (norm): Identity() + ) + (t_embedder): TimestepEmbedder( + (mlp): Sequential( + (0): Linear(in_features=256, out_features=384, bias=True) + (1): SiLU() + (2): Linear(in_features=384, out_features=384, bias=True) + ) + ) + (y_embedder): LabelEmbedder( + (embedding_table): Embedding(1001, 384) + ) + (blocks): ModuleList( + (0-11): 12 x DiTBlock( + (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=False) + (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=False) + (attn): Attention( + (qkv): Linear(in_features=384, out_features=1152, bias=True) + (q_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True) + (k_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=384, out_features=384, bias=True) + (proj_drop): Dropout(p=0.0, inplace=False) + ) + (mlp): Mlp( + (fc1): Linear(in_features=384, out_features=1536, bias=True) + (act): GELU(approximate='tanh') + (drop1): Dropout(p=0, inplace=False) + (norm): Identity() + (fc2): Linear(in_features=1536, out_features=384, bias=True) + (drop2): Dropout(p=0, inplace=False) + ) + (adaLN_modulation): Sequential( + (0): SiLU() + (1): Linear(in_features=384, out_features=2304, bias=True) + ) + ) + ) + (final_layer): FinalLayer( + (norm_final): LayerNorm((384,), eps=1e-06, elementwise_affine=False) + (linear): Linear(in_features=384, out_features=32, bias=True) + (adaLN_modulation): Sequential( + (0): SiLU() + (1): Linear(in_features=384, out_features=768, bias=True) + ) + ) + ) +) +[2025-02-19 15:23:26] DiT Parameters: 32.97M +[2025-02-19 15:23:26] DiT Trainable Parameters: 32.87M +[2025-02-19 15:23:26] Optimizer: AdamW, lr=0.0002, beta2=0.999 +[2025-02-19 15:23:26] module.pos_embed.requires_grad : False +[2025-02-19 15:23:26] module.x_embedder.proj.weight.requires_grad : True +[2025-02-19 15:23:26] module.x_embedder.proj.bias.requires_grad : True +[2025-02-19 15:23:26] module.t_embedder.mlp.0.weight.requires_grad : True +[2025-02-19 15:23:26] module.t_embedder.mlp.0.bias.requires_grad : True +[2025-02-19 15:23:26] module.t_embedder.mlp.2.weight.requires_grad : True +[2025-02-19 15:23:26] module.t_embedder.mlp.2.bias.requires_grad : True +[2025-02-19 15:23:26] module.y_embedder.embedding_table.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.0.attn.qkv.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.0.attn.qkv.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.0.attn.q_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.0.attn.q_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.0.attn.k_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.0.attn.k_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.0.attn.proj.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.0.attn.proj.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.0.mlp.fc1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.0.mlp.fc1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.0.mlp.fc2.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.0.mlp.fc2.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.0.adaLN_modulation.1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.0.adaLN_modulation.1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.1.attn.qkv.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.1.attn.qkv.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.1.attn.q_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.1.attn.q_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.1.attn.k_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.1.attn.k_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.1.attn.proj.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.1.attn.proj.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.1.mlp.fc1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.1.mlp.fc1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.1.mlp.fc2.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.1.mlp.fc2.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.1.adaLN_modulation.1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.1.adaLN_modulation.1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.2.attn.qkv.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.2.attn.qkv.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.2.attn.q_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.2.attn.q_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.2.attn.k_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.2.attn.k_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.2.attn.proj.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.2.attn.proj.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.2.mlp.fc1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.2.mlp.fc1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.2.mlp.fc2.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.2.mlp.fc2.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.2.adaLN_modulation.1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.2.adaLN_modulation.1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.3.attn.qkv.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.3.attn.qkv.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.3.attn.q_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.3.attn.q_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.3.attn.k_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.3.attn.k_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.3.attn.proj.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.3.attn.proj.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.3.mlp.fc1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.3.mlp.fc1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.3.mlp.fc2.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.3.mlp.fc2.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.3.adaLN_modulation.1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.3.adaLN_modulation.1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.4.attn.qkv.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.4.attn.qkv.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.4.attn.q_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.4.attn.q_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.4.attn.k_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.4.attn.k_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.4.attn.proj.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.4.attn.proj.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.4.mlp.fc1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.4.mlp.fc1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.4.mlp.fc2.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.4.mlp.fc2.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.4.adaLN_modulation.1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.4.adaLN_modulation.1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.5.attn.qkv.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.5.attn.qkv.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.5.attn.q_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.5.attn.q_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.5.attn.k_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.5.attn.k_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.5.attn.proj.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.5.attn.proj.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.5.mlp.fc1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.5.mlp.fc1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.5.mlp.fc2.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.5.mlp.fc2.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.5.adaLN_modulation.1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.5.adaLN_modulation.1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.6.attn.qkv.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.6.attn.qkv.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.6.attn.q_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.6.attn.q_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.6.attn.k_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.6.attn.k_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.6.attn.proj.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.6.attn.proj.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.6.mlp.fc1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.6.mlp.fc1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.6.mlp.fc2.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.6.mlp.fc2.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.6.adaLN_modulation.1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.6.adaLN_modulation.1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.7.attn.qkv.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.7.attn.qkv.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.7.attn.q_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.7.attn.q_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.7.attn.k_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.7.attn.k_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.7.attn.proj.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.7.attn.proj.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.7.mlp.fc1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.7.mlp.fc1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.7.mlp.fc2.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.7.mlp.fc2.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.7.adaLN_modulation.1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.7.adaLN_modulation.1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.8.attn.qkv.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.8.attn.qkv.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.8.attn.q_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.8.attn.q_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.8.attn.k_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.8.attn.k_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.8.attn.proj.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.8.attn.proj.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.8.mlp.fc1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.8.mlp.fc1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.8.mlp.fc2.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.8.mlp.fc2.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.8.adaLN_modulation.1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.8.adaLN_modulation.1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.9.attn.qkv.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.9.attn.qkv.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.9.attn.q_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.9.attn.q_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.9.attn.k_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.9.attn.k_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.9.attn.proj.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.9.attn.proj.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.9.mlp.fc1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.9.mlp.fc1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.9.mlp.fc2.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.9.mlp.fc2.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.9.adaLN_modulation.1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.9.adaLN_modulation.1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.10.attn.qkv.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.10.attn.qkv.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.10.attn.q_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.10.attn.q_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.10.attn.k_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.10.attn.k_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.10.attn.proj.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.10.attn.proj.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.10.mlp.fc1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.10.mlp.fc1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.10.mlp.fc2.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.10.mlp.fc2.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.10.adaLN_modulation.1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.10.adaLN_modulation.1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.11.attn.qkv.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.11.attn.qkv.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.11.attn.q_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.11.attn.q_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.11.attn.k_norm.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.11.attn.k_norm.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.11.attn.proj.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.11.attn.proj.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.11.mlp.fc1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.11.mlp.fc1.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.11.mlp.fc2.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.11.mlp.fc2.bias.requires_grad : True +[2025-02-19 15:23:26] module.blocks.11.adaLN_modulation.1.weight.requires_grad : True +[2025-02-19 15:23:26] module.blocks.11.adaLN_modulation.1.bias.requires_grad : True +[2025-02-19 15:23:26] module.final_layer.linear.weight.requires_grad : True +[2025-02-19 15:23:26] module.final_layer.linear.bias.requires_grad : True +[2025-02-19 15:23:26] module.final_layer.adaLN_modulation.1.weight.requires_grad : True +[2025-02-19 15:23:26] module.final_layer.adaLN_modulation.1.bias.requires_grad : True +[2025-02-19 15:23:27] Dataset contains 1,281,168 images /data/checkpoints/LanguageBind/offline_feature/offline_vae_256/imagenet_train_256 +[2025-02-19 15:23:27] Batch size 128 per gpu, with 1024 global batch size +[2025-02-19 15:23:27] Train config: {'ckpt_path': '/data/logs/smo/diff_s_1000kx1024_qknorm/checkpoints/1000000.pt', 'data': {'data_path': '/data/checkpoints/LanguageBind/offline_feature/offline_vae_256/imagenet_train_256', 'fid_reference_file': '/data/checkpoints/VIRTUAL_imagenet256_labeled.npz', 'image_size': 256, 'num_classes': 1000, 'num_workers': 16, 'latent_norm': False, 'latent_multiplier': 0.18215}, 'vae': {'model_path': '/data/checkpoints/stabilityai/sd-vae-ft-ema', 'downsample_ratio': 8}, 'model': {'model_type': 'DiT-S/2', 'use_qknorm': True, 'use_swiglu': False, 'use_rope': False, 'use_rmsnorm': False, 'in_chans': 4, 'use_checkpoint': False}, 'train': {'max_steps': 1000000, 'global_batch_size': 1024, 'global_seed': 0, 'output_dir': '../logs/smo/diff_s_1000kx1024_qknorm', 'ckpt': None, 'log_every': 100, 'ckpt_every': 20000, 'wandb': True, 'seed': 1234, 'precision': 'bf16', 'resume': False}, 'optimizer': {'lr': 0.0002, 'beta2': 0.999}, 'wandb': {'proj_name': 'smo', 'log_name': 'diff_s_1000kx1024_qknorm', 'key': '953e958793b218efb850fa194e85843e2c3bd88b'}, 'scheduler': {'diffusion': True, 'transport': False}, 'diffusion': {'learn_sigma': True, 'diffusion_steps': 1000}, 'transport': {'path_type': 'Linear', 'prediction': 'velocity', 'loss_weight': None, 'sample_eps': None, 'train_eps': None, 'use_cosine_loss': True, 'use_lognorm': True}, 'sample': {'num_sampling_steps': 250, 'cfg_scale': 1.0, 'per_proc_batch_size': 64, 'fid_num': 50000, 'cfg_interval_start': 0.0}} +[2025-02-19 15:23:51] (step=0000100) Train Loss: 0.3625, Train Steps/Sec: 4.14, Grad Norm: 0.5866 +[2025-02-19 15:24:00] (step=0000200) Train Loss: 0.2040, Train Steps/Sec: 11.46, Grad Norm: 0.3173 +[2025-02-19 15:24:08] (step=0000300) Train Loss: 0.1952, Train Steps/Sec: 11.43, Grad Norm: 0.2793 +[2025-02-19 15:24:17] (step=0000400) Train Loss: 0.1917, Train Steps/Sec: 11.42, Grad Norm: 0.2862 +[2025-02-19 15:24:26] (step=0000500) Train Loss: 0.1899, Train Steps/Sec: 11.41, Grad Norm: 0.2927 +[2025-02-19 15:24:35] (step=0000600) Train Loss: 0.1887, Train Steps/Sec: 11.42, Grad Norm: 0.2960 +[2025-02-19 15:24:43] (step=0000700) Train Loss: 0.1859, Train Steps/Sec: 11.41, Grad Norm: 0.2836 +[2025-02-19 15:24:52] (step=0000800) Train Loss: 0.1849, Train Steps/Sec: 11.42, Grad Norm: 0.3035 +[2025-02-19 15:25:01] (step=0000900) Train Loss: 0.1835, Train Steps/Sec: 10.89, Grad Norm: 0.2324 +[2025-02-19 15:25:11] (step=0001000) Train Loss: 0.1811, Train Steps/Sec: 9.92, Grad Norm: 0.2428 +[2025-02-19 15:25:20] (step=0001100) Train Loss: 0.1816, Train Steps/Sec: 11.37, Grad Norm: 0.2250 +[2025-02-19 15:25:29] (step=0001200) Train Loss: 0.1800, Train Steps/Sec: 11.39, Grad Norm: 0.2076 +[2025-02-19 15:25:40] (step=0001300) Train Loss: 0.1789, Train Steps/Sec: 9.45, Grad Norm: 0.1979 +[2025-02-19 15:25:48] (step=0001400) Train Loss: 0.1795, Train Steps/Sec: 11.57, Grad Norm: 0.1927 +[2025-02-19 15:25:57] (step=0001500) Train Loss: 0.1771, Train Steps/Sec: 11.57, Grad Norm: 0.1867 +[2025-02-19 15:26:06] (step=0001600) Train Loss: 0.1757, Train Steps/Sec: 11.57, Grad Norm: 0.1696 +[2025-02-19 15:26:14] (step=0001700) Train Loss: 0.1754, Train Steps/Sec: 11.58, Grad Norm: 0.1713 +[2025-02-19 15:26:23] (step=0001800) Train Loss: 0.1752, Train Steps/Sec: 11.57, Grad Norm: 0.1652 +[2025-02-19 15:26:32] (step=0001900) Train Loss: 0.1752, Train Steps/Sec: 11.57, Grad Norm: 0.1674 +[2025-02-19 15:26:40] (step=0002000) Train Loss: 0.1752, Train Steps/Sec: 11.57, Grad Norm: 0.1626 +[2025-02-19 15:26:49] (step=0002100) Train Loss: 0.1739, Train Steps/Sec: 11.57, Grad Norm: 0.1512 +[2025-02-19 15:26:57] (step=0002200) Train Loss: 0.1718, Train Steps/Sec: 11.58, Grad Norm: 0.1503 +[2025-02-19 15:27:06] (step=0002300) Train Loss: 0.1717, Train Steps/Sec: 11.59, Grad Norm: 0.1508 +[2025-02-19 15:27:15] (step=0002400) Train Loss: 0.1720, Train Steps/Sec: 11.58, Grad Norm: 0.1472 +[2025-02-19 15:27:23] (step=0002500) Train Loss: 0.1723, Train Steps/Sec: 11.53, Grad Norm: 0.1414 +[2025-02-19 15:27:33] (step=0002600) Train Loss: 0.1730, Train Steps/Sec: 10.06, Grad Norm: 0.1401 +[2025-02-19 15:27:42] (step=0002700) Train Loss: 0.1719, Train Steps/Sec: 11.60, Grad Norm: 0.1275 +[2025-02-19 15:27:51] (step=0002800) Train Loss: 0.1708, Train Steps/Sec: 11.61, Grad Norm: 0.1389 +[2025-02-19 15:27:59] (step=0002900) Train Loss: 0.1711, Train Steps/Sec: 11.62, Grad Norm: 0.1308 +[2025-02-19 15:28:08] (step=0003000) Train Loss: 0.1712, Train Steps/Sec: 11.62, Grad Norm: 0.1296 +[2025-02-19 15:28:17] (step=0003100) Train Loss: 0.1709, Train Steps/Sec: 11.21, Grad Norm: 0.1217 +[2025-02-19 15:28:27] (step=0003200) Train Loss: 0.1717, Train Steps/Sec: 10.01, Grad Norm: 0.1275 +[2025-02-19 15:28:35] (step=0003300) Train Loss: 0.1703, Train Steps/Sec: 11.63, Grad Norm: 0.1138 +[2025-02-19 15:28:44] (step=0003400) Train Loss: 0.1711, Train Steps/Sec: 11.63, Grad Norm: 0.1198 +[2025-02-19 15:28:52] (step=0003500) Train Loss: 0.1703, Train Steps/Sec: 11.63, Grad Norm: 0.1201 +[2025-02-19 15:29:01] (step=0003600) Train Loss: 0.1687, Train Steps/Sec: 11.63, Grad Norm: 0.1132 +[2025-02-19 15:29:10] (step=0003700) Train Loss: 0.1689, Train Steps/Sec: 11.63, Grad Norm: 0.1132 +[2025-02-19 15:29:20] (step=0003800) Train Loss: 0.1682, Train Steps/Sec: 10.01, Grad Norm: 0.1110 +[2025-02-19 15:29:28] (step=0003900) Train Loss: 0.1686, Train Steps/Sec: 11.63, Grad Norm: 0.1051 +[2025-02-19 15:29:37] (step=0004000) Train Loss: 0.1680, Train Steps/Sec: 11.63, Grad Norm: 0.1021 +[2025-02-19 15:29:45] (step=0004100) Train Loss: 0.1672, Train Steps/Sec: 11.62, Grad Norm: 0.1053 +[2025-02-19 15:29:54] (step=0004200) Train Loss: 0.1675, Train Steps/Sec: 11.63, Grad Norm: 0.1069 +[2025-02-19 15:30:03] (step=0004300) Train Loss: 0.1680, Train Steps/Sec: 11.63, Grad Norm: 0.1052 +[2025-02-19 15:30:11] (step=0004400) Train Loss: 0.1674, Train Steps/Sec: 11.63, Grad Norm: 0.0988 +[2025-02-19 15:30:20] (step=0004500) Train Loss: 0.1682, Train Steps/Sec: 11.64, Grad Norm: 0.0957 +[2025-02-19 15:30:28] (step=0004600) Train Loss: 0.1687, Train Steps/Sec: 11.63, Grad Norm: 0.1017 +[2025-02-19 15:30:37] (step=0004700) Train Loss: 0.1672, Train Steps/Sec: 11.63, Grad Norm: 0.0922 +[2025-02-19 15:30:46] (step=0004800) Train Loss: 0.1677, Train Steps/Sec: 11.62, Grad Norm: 0.0946 +[2025-02-19 15:30:54] (step=0004900) Train Loss: 0.1677, Train Steps/Sec: 11.61, Grad Norm: 0.0954 +[2025-02-19 15:31:03] (step=0005000) Train Loss: 0.1675, Train Steps/Sec: 11.57, Grad Norm: 0.0894 +[2025-02-19 15:31:13] (step=0005100) Train Loss: 0.1663, Train Steps/Sec: 10.04, Grad Norm: 0.0926 +[2025-02-19 15:31:21] (step=0005200) Train Loss: 0.1666, Train Steps/Sec: 11.62, Grad Norm: 0.0924 +[2025-02-19 15:31:30] (step=0005300) Train Loss: 0.1668, Train Steps/Sec: 11.24, Grad Norm: 0.0918 +[2025-02-19 15:31:40] (step=0005400) Train Loss: 0.1674, Train Steps/Sec: 10.29, Grad Norm: 0.0823 +[2025-02-19 15:31:49] (step=0005500) Train Loss: 0.1666, Train Steps/Sec: 11.63, Grad Norm: 0.0853 +[2025-02-19 15:31:57] (step=0005600) Train Loss: 0.1661, Train Steps/Sec: 11.63, Grad Norm: 0.0846 +[2025-02-19 15:32:06] (step=0005700) Train Loss: 0.1651, Train Steps/Sec: 11.63, Grad Norm: 0.0862 +[2025-02-19 15:32:15] (step=0005800) Train Loss: 0.1661, Train Steps/Sec: 11.63, Grad Norm: 0.0839 +[2025-02-19 15:32:23] (step=0005900) Train Loss: 0.1667, Train Steps/Sec: 11.62, Grad Norm: 0.0783 +[2025-02-19 15:32:32] (step=0006000) Train Loss: 0.1669, Train Steps/Sec: 11.63, Grad Norm: 0.0777 +[2025-02-19 15:32:40] (step=0006100) Train Loss: 0.1647, Train Steps/Sec: 11.63, Grad Norm: 0.0824 +[2025-02-19 15:32:49] (step=0006200) Train Loss: 0.1666, Train Steps/Sec: 11.62, Grad Norm: 0.0769 +[2025-02-19 15:32:59] (step=0006300) Train Loss: 0.1657, Train Steps/Sec: 10.05, Grad Norm: 0.0814 +[2025-02-19 15:33:07] (step=0006400) Train Loss: 0.1652, Train Steps/Sec: 11.63, Grad Norm: 0.0754 +[2025-02-19 15:33:16] (step=0006500) Train Loss: 0.1651, Train Steps/Sec: 11.62, Grad Norm: 0.0791 +[2025-02-19 15:33:25] (step=0006600) Train Loss: 0.1657, Train Steps/Sec: 11.63, Grad Norm: 0.0763 +[2025-02-19 15:33:33] (step=0006700) Train Loss: 0.1657, Train Steps/Sec: 11.63, Grad Norm: 0.0742 +[2025-02-19 15:33:42] (step=0006800) Train Loss: 0.1656, Train Steps/Sec: 11.65, Grad Norm: 0.0745 +[2025-02-19 15:33:50] (step=0006900) Train Loss: 0.1652, Train Steps/Sec: 11.64, Grad Norm: 0.0748 +[2025-02-19 15:33:59] (step=0007000) Train Loss: 0.1652, Train Steps/Sec: 11.65, Grad Norm: 0.0682 +[2025-02-19 15:34:08] (step=0007100) Train Loss: 0.1658, Train Steps/Sec: 11.64, Grad Norm: 0.0737 +[2025-02-19 15:34:16] (step=0007200) Train Loss: 0.1643, Train Steps/Sec: 11.65, Grad Norm: 0.0680 +[2025-02-19 15:34:25] (step=0007300) Train Loss: 0.1650, Train Steps/Sec: 11.64, Grad Norm: 0.0699 +[2025-02-19 15:34:33] (step=0007400) Train Loss: 0.1651, Train Steps/Sec: 11.64, Grad Norm: 0.0687 +[2025-02-19 15:34:42] (step=0007500) Train Loss: 0.1647, Train Steps/Sec: 11.19, Grad Norm: 0.0672 +[2025-02-19 15:34:54] (step=0007600) Train Loss: 0.1648, Train Steps/Sec: 8.81, Grad Norm: 0.0703 +[2025-02-19 15:35:02] (step=0007700) Train Loss: 0.1635, Train Steps/Sec: 11.61, Grad Norm: 0.0671 +[2025-02-19 15:35:11] (step=0007800) Train Loss: 0.1634, Train Steps/Sec: 11.60, Grad Norm: 0.0680 +[2025-02-19 15:35:20] (step=0007900) Train Loss: 0.1644, Train Steps/Sec: 11.59, Grad Norm: 0.0636 +[2025-02-19 15:35:28] (step=0008000) Train Loss: 0.1629, Train Steps/Sec: 11.60, Grad Norm: 0.0580 +[2025-02-19 15:35:37] (step=0008100) Train Loss: 0.1634, Train Steps/Sec: 11.60, Grad Norm: 0.0614 +[2025-02-19 15:35:45] (step=0008200) Train Loss: 0.1622, Train Steps/Sec: 11.61, Grad Norm: 0.0647 +[2025-02-19 15:35:54] (step=0008300) Train Loss: 0.1636, Train Steps/Sec: 11.61, Grad Norm: 0.0674 +[2025-02-19 15:36:03] (step=0008400) Train Loss: 0.1634, Train Steps/Sec: 11.60, Grad Norm: 0.0618 +[2025-02-19 15:36:11] (step=0008500) Train Loss: 0.1635, Train Steps/Sec: 11.60, Grad Norm: 0.0596 +[2025-02-19 15:36:20] (step=0008600) Train Loss: 0.1652, Train Steps/Sec: 11.60, Grad Norm: 0.0640 +[2025-02-19 15:36:29] (step=0008700) Train Loss: 0.1632, Train Steps/Sec: 11.61, Grad Norm: 0.0558 +[2025-02-19 15:36:39] (step=0008800) Train Loss: 0.1621, Train Steps/Sec: 9.79, Grad Norm: 0.0580 +[2025-02-19 15:36:47] (step=0008900) Train Loss: 0.1638, Train Steps/Sec: 11.60, Grad Norm: 0.0607 +[2025-02-19 15:36:56] (step=0009000) Train Loss: 0.1642, Train Steps/Sec: 11.59, Grad Norm: 0.0615 +[2025-02-19 15:37:05] (step=0009100) Train Loss: 0.1635, Train Steps/Sec: 11.60, Grad Norm: 0.0593 +[2025-02-19 15:37:13] (step=0009200) Train Loss: 0.1635, Train Steps/Sec: 11.59, Grad Norm: 0.0588 +[2025-02-19 15:37:22] (step=0009300) Train Loss: 0.1622, Train Steps/Sec: 11.60, Grad Norm: 0.0593 +[2025-02-19 15:37:30] (step=0009400) Train Loss: 0.1639, Train Steps/Sec: 11.62, Grad Norm: 0.0583 +[2025-02-19 15:37:39] (step=0009500) Train Loss: 0.1613, Train Steps/Sec: 11.62, Grad Norm: 0.0577 +[2025-02-19 15:37:48] (step=0009600) Train Loss: 0.1623, Train Steps/Sec: 11.21, Grad Norm: 0.0564 +[2025-02-19 15:37:58] (step=0009700) Train Loss: 0.1644, Train Steps/Sec: 10.53, Grad Norm: 0.0571 +[2025-02-19 15:38:06] (step=0009800) Train Loss: 0.1636, Train Steps/Sec: 11.24, Grad Norm: 0.0564 +[2025-02-19 15:38:15] (step=0009900) Train Loss: 0.1627, Train Steps/Sec: 11.58, Grad Norm: 0.0569 +[2025-02-19 15:38:24] (step=0010000) Train Loss: 0.1616, Train Steps/Sec: 11.51, Grad Norm: 0.0557 +[2025-02-19 15:38:34] (step=0010100) Train Loss: 0.1640, Train Steps/Sec: 10.01, Grad Norm: 0.0551 +[2025-02-19 15:38:42] (step=0010200) Train Loss: 0.1617, Train Steps/Sec: 11.60, Grad Norm: 0.0509 +[2025-02-19 15:38:51] (step=0010300) Train Loss: 0.1631, Train Steps/Sec: 11.60, Grad Norm: 0.0539 +[2025-02-19 15:39:00] (step=0010400) Train Loss: 0.1628, Train Steps/Sec: 11.60, Grad Norm: 0.0527 +[2025-02-19 15:39:08] (step=0010500) Train Loss: 0.1620, Train Steps/Sec: 11.59, Grad Norm: 0.0529 +[2025-02-19 15:39:17] (step=0010600) Train Loss: 0.1624, Train Steps/Sec: 11.59, Grad Norm: 0.0539 +[2025-02-19 15:39:25] (step=0010700) Train Loss: 0.1632, Train Steps/Sec: 11.60, Grad Norm: 0.0523 +[2025-02-19 15:39:34] (step=0010800) Train Loss: 0.1637, Train Steps/Sec: 11.60, Grad Norm: 0.0512 +[2025-02-19 15:39:43] (step=0010900) Train Loss: 0.1627, Train Steps/Sec: 11.59, Grad Norm: 0.0529 +[2025-02-19 15:39:51] (step=0011000) Train Loss: 0.1620, Train Steps/Sec: 11.60, Grad Norm: 0.0491 +[2025-02-19 15:40:00] (step=0011100) Train Loss: 0.1615, Train Steps/Sec: 11.60, Grad Norm: 0.0487 +[2025-02-19 15:40:09] (step=0011200) Train Loss: 0.1611, Train Steps/Sec: 11.60, Grad Norm: 0.0493 +[2025-02-19 15:40:19] (step=0011300) Train Loss: 0.1613, Train Steps/Sec: 10.01, Grad Norm: 0.0482 +[2025-02-19 15:40:27] (step=0011400) Train Loss: 0.1626, Train Steps/Sec: 11.61, Grad Norm: 0.0485 +[2025-02-19 15:40:36] (step=0011500) Train Loss: 0.1611, Train Steps/Sec: 11.61, Grad Norm: 0.0456 +[2025-02-19 15:40:44] (step=0011600) Train Loss: 0.1625, Train Steps/Sec: 11.61, Grad Norm: 0.0515 +[2025-02-19 15:40:53] (step=0011700) Train Loss: 0.1628, Train Steps/Sec: 11.61, Grad Norm: 0.0506 +[2025-02-19 15:41:02] (step=0011800) Train Loss: 0.1619, Train Steps/Sec: 11.22, Grad Norm: 0.0438 +[2025-02-19 15:41:12] (step=0011900) Train Loss: 0.1619, Train Steps/Sec: 10.05, Grad Norm: 0.0490 +[2025-02-19 15:41:21] (step=0012000) Train Loss: 0.1604, Train Steps/Sec: 11.61, Grad Norm: 0.0479 +[2025-02-19 15:41:29] (step=0012100) Train Loss: 0.1607, Train Steps/Sec: 11.61, Grad Norm: 0.0456 +[2025-02-19 15:41:38] (step=0012200) Train Loss: 0.1614, Train Steps/Sec: 11.62, Grad Norm: 0.0460 +[2025-02-19 15:41:46] (step=0012300) Train Loss: 0.1619, Train Steps/Sec: 11.61, Grad Norm: 0.0456 +[2025-02-19 15:41:55] (step=0012400) Train Loss: 0.1607, Train Steps/Sec: 11.62, Grad Norm: 0.0454 +[2025-02-19 15:42:04] (step=0012500) Train Loss: 0.1610, Train Steps/Sec: 11.55, Grad Norm: 0.0444 +[2025-02-19 15:42:14] (step=0012600) Train Loss: 0.1632, Train Steps/Sec: 10.05, Grad Norm: 0.0455 +[2025-02-19 15:42:22] (step=0012700) Train Loss: 0.1609, Train Steps/Sec: 11.61, Grad Norm: 0.0429 +[2025-02-19 15:42:31] (step=0012800) Train Loss: 0.1608, Train Steps/Sec: 11.61, Grad Norm: 0.0438 +[2025-02-19 15:42:39] (step=0012900) Train Loss: 0.1592, Train Steps/Sec: 11.61, Grad Norm: 0.0461 +[2025-02-19 15:42:48] (step=0013000) Train Loss: 0.1605, Train Steps/Sec: 11.60, Grad Norm: 0.0446 +[2025-02-19 15:42:57] (step=0013100) Train Loss: 0.1601, Train Steps/Sec: 11.58, Grad Norm: 0.0441 +[2025-02-19 15:43:05] (step=0013200) Train Loss: 0.1618, Train Steps/Sec: 11.60, Grad Norm: 0.0441 +[2025-02-19 15:43:14] (step=0013300) Train Loss: 0.1608, Train Steps/Sec: 11.60, Grad Norm: 0.0432 +[2025-02-19 15:43:23] (step=0013400) Train Loss: 0.1611, Train Steps/Sec: 11.59, Grad Norm: 0.0424 +[2025-02-19 15:43:31] (step=0013500) Train Loss: 0.1614, Train Steps/Sec: 11.60, Grad Norm: 0.0459 +[2025-02-19 15:43:40] (step=0013600) Train Loss: 0.1610, Train Steps/Sec: 11.61, Grad Norm: 0.0427 +[2025-02-19 15:43:48] (step=0013700) Train Loss: 0.1607, Train Steps/Sec: 11.61, Grad Norm: 0.0421 +[2025-02-19 15:43:58] (step=0013800) Train Loss: 0.1597, Train Steps/Sec: 9.96, Grad Norm: 0.0428 +[2025-02-19 15:44:07] (step=0013900) Train Loss: 0.1595, Train Steps/Sec: 11.60, Grad Norm: 0.0391 +[2025-02-19 15:44:16] (step=0014000) Train Loss: 0.1613, Train Steps/Sec: 11.22, Grad Norm: 0.0408 +[2025-02-19 15:44:26] (step=0014100) Train Loss: 0.1600, Train Steps/Sec: 9.76, Grad Norm: 0.0391 +[2025-02-19 15:44:35] (step=0014200) Train Loss: 0.1601, Train Steps/Sec: 11.61, Grad Norm: 0.0409 +[2025-02-19 15:44:43] (step=0014300) Train Loss: 0.1596, Train Steps/Sec: 11.62, Grad Norm: 0.0414 +[2025-02-19 15:44:52] (step=0014400) Train Loss: 0.1620, Train Steps/Sec: 11.62, Grad Norm: 0.0390 +[2025-02-19 15:45:01] (step=0014500) Train Loss: 0.1608, Train Steps/Sec: 11.62, Grad Norm: 0.0406 +[2025-02-19 15:45:09] (step=0014600) Train Loss: 0.1602, Train Steps/Sec: 11.61, Grad Norm: 0.0391 +[2025-02-19 15:45:18] (step=0014700) Train Loss: 0.1600, Train Steps/Sec: 11.60, Grad Norm: 0.0387 +[2025-02-19 15:45:27] (step=0014800) Train Loss: 0.1605, Train Steps/Sec: 11.60, Grad Norm: 0.0395 +[2025-02-19 15:45:35] (step=0014900) Train Loss: 0.1584, Train Steps/Sec: 11.59, Grad Norm: 0.0401 +[2025-02-19 15:45:44] (step=0015000) Train Loss: 0.1595, Train Steps/Sec: 11.52, Grad Norm: 0.0396 +[2025-02-19 15:45:54] (step=0015100) Train Loss: 0.1601, Train Steps/Sec: 10.03, Grad Norm: 0.0374 +[2025-02-19 15:46:02] (step=0015200) Train Loss: 0.1596, Train Steps/Sec: 11.56, Grad Norm: 0.0390 +[2025-02-19 15:46:11] (step=0015300) Train Loss: 0.1617, Train Steps/Sec: 11.56, Grad Norm: 0.0380 +[2025-02-19 15:46:20] (step=0015400) Train Loss: 0.1598, Train Steps/Sec: 11.56, Grad Norm: 0.0374 +[2025-02-19 15:46:28] (step=0015500) Train Loss: 0.1608, Train Steps/Sec: 11.56, Grad Norm: 0.0373 +[2025-02-19 15:46:37] (step=0015600) Train Loss: 0.1594, Train Steps/Sec: 11.56, Grad Norm: 0.0373 +[2025-02-19 15:46:46] (step=0015700) Train Loss: 0.1601, Train Steps/Sec: 11.56, Grad Norm: 0.0383 +[2025-02-19 15:46:54] (step=0015800) Train Loss: 0.1594, Train Steps/Sec: 11.56, Grad Norm: 0.0359 +[2025-02-19 15:47:03] (step=0015900) Train Loss: 0.1591, Train Steps/Sec: 11.55, Grad Norm: 0.0355 +[2025-02-19 15:47:12] (step=0016000) Train Loss: 0.1596, Train Steps/Sec: 11.55, Grad Norm: 0.0347 +[2025-02-19 15:47:20] (step=0016100) Train Loss: 0.1608, Train Steps/Sec: 11.56, Grad Norm: 0.0385 +[2025-02-19 15:47:29] (step=0016200) Train Loss: 0.1587, Train Steps/Sec: 11.17, Grad Norm: 0.0360 +[2025-02-19 15:47:41] (step=0016300) Train Loss: 0.1590, Train Steps/Sec: 8.61, Grad Norm: 0.0360 +[2025-02-19 15:47:50] (step=0016400) Train Loss: 0.1594, Train Steps/Sec: 11.60, Grad Norm: 0.0359 +[2025-02-19 15:47:58] (step=0016500) Train Loss: 0.1610, Train Steps/Sec: 11.60, Grad Norm: 0.0354 +[2025-02-19 15:48:07] (step=0016600) Train Loss: 0.1601, Train Steps/Sec: 11.59, Grad Norm: 0.0343 +[2025-02-19 15:48:15] (step=0016700) Train Loss: 0.1585, Train Steps/Sec: 11.61, Grad Norm: 0.0348 +[2025-02-19 15:48:24] (step=0016800) Train Loss: 0.1581, Train Steps/Sec: 11.60, Grad Norm: 0.0357 +[2025-02-19 15:48:33] (step=0016900) Train Loss: 0.1589, Train Steps/Sec: 11.60, Grad Norm: 0.0368 +[2025-02-19 15:48:41] (step=0017000) Train Loss: 0.1591, Train Steps/Sec: 11.59, Grad Norm: 0.0360 +[2025-02-19 15:48:50] (step=0017100) Train Loss: 0.1616, Train Steps/Sec: 11.61, Grad Norm: 0.0333 +[2025-02-19 15:48:59] (step=0017200) Train Loss: 0.1580, Train Steps/Sec: 11.60, Grad Norm: 0.0358 +[2025-02-19 15:49:07] (step=0017300) Train Loss: 0.1608, Train Steps/Sec: 11.60, Grad Norm: 0.0367 +[2025-02-19 15:49:16] (step=0017400) Train Loss: 0.1583, Train Steps/Sec: 11.60, Grad Norm: 0.0354 +[2025-02-19 15:49:24] (step=0017500) Train Loss: 0.1604, Train Steps/Sec: 11.55, Grad Norm: 0.0346 +[2025-02-19 15:49:34] (step=0017600) Train Loss: 0.1591, Train Steps/Sec: 10.10, Grad Norm: 0.0327 +[2025-02-19 15:49:43] (step=0017700) Train Loss: 0.1600, Train Steps/Sec: 11.62, Grad Norm: 0.0336 +[2025-02-19 15:49:52] (step=0017800) Train Loss: 0.1604, Train Steps/Sec: 11.63, Grad Norm: 0.0329 +[2025-02-19 15:50:00] (step=0017900) Train Loss: 0.1589, Train Steps/Sec: 11.62, Grad Norm: 0.0328 +[2025-02-19 15:50:09] (step=0018000) Train Loss: 0.1587, Train Steps/Sec: 11.63, Grad Norm: 0.0328 +[2025-02-19 15:50:17] (step=0018100) Train Loss: 0.1584, Train Steps/Sec: 11.62, Grad Norm: 0.0320 +[2025-02-19 15:50:26] (step=0018200) Train Loss: 0.1578, Train Steps/Sec: 11.62, Grad Norm: 0.0309 +[2025-02-19 15:50:35] (step=0018300) Train Loss: 0.1599, Train Steps/Sec: 11.23, Grad Norm: 0.0319 +[2025-02-19 15:50:45] (step=0018400) Train Loss: 0.1588, Train Steps/Sec: 10.02, Grad Norm: 0.0338 +[2025-02-19 15:50:54] (step=0018500) Train Loss: 0.1583, Train Steps/Sec: 11.25, Grad Norm: 0.0313 +[2025-02-19 15:51:02] (step=0018600) Train Loss: 0.1604, Train Steps/Sec: 11.60, Grad Norm: 0.0315 +[2025-02-19 15:51:11] (step=0018700) Train Loss: 0.1586, Train Steps/Sec: 11.61, Grad Norm: 0.0310 +[2025-02-19 15:51:21] (step=0018800) Train Loss: 0.1600, Train Steps/Sec: 10.03, Grad Norm: 0.0316 +[2025-02-19 15:51:30] (step=0018900) Train Loss: 0.1598, Train Steps/Sec: 11.61, Grad Norm: 0.0308 +[2025-02-19 15:51:38] (step=0019000) Train Loss: 0.1570, Train Steps/Sec: 11.60, Grad Norm: 0.0301 +[2025-02-19 15:51:47] (step=0019100) Train Loss: 0.1580, Train Steps/Sec: 11.61, Grad Norm: 0.0293 +[2025-02-19 15:51:55] (step=0019200) Train Loss: 0.1580, Train Steps/Sec: 11.61, Grad Norm: 0.0308 +[2025-02-19 15:52:04] (step=0019300) Train Loss: 0.1591, Train Steps/Sec: 11.61, Grad Norm: 0.0317 +[2025-02-19 15:52:13] (step=0019400) Train Loss: 0.1602, Train Steps/Sec: 11.60, Grad Norm: 0.0284 +[2025-02-19 15:52:21] (step=0019500) Train Loss: 0.1582, Train Steps/Sec: 11.60, Grad Norm: 0.0288 +[2025-02-19 15:52:30] (step=0019600) Train Loss: 0.1599, Train Steps/Sec: 11.60, Grad Norm: 0.0310 +[2025-02-19 15:52:39] (step=0019700) Train Loss: 0.1575, Train Steps/Sec: 11.60, Grad Norm: 0.0310 +[2025-02-19 15:52:47] (step=0019800) Train Loss: 0.1593, Train Steps/Sec: 11.60, Grad Norm: 0.0313 +[2025-02-19 15:52:56] (step=0019900) Train Loss: 0.1570, Train Steps/Sec: 11.61, Grad Norm: 0.0301 +[2025-02-19 15:53:04] (step=0020000) Train Loss: 0.1588, Train Steps/Sec: 11.55, Grad Norm: 0.0321 +[2025-02-19 15:53:05] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0020000.pt +[2025-02-19 15:53:15] (step=0020100) Train Loss: 0.1589, Train Steps/Sec: 9.09, Grad Norm: 0.0304 +[2025-02-19 15:53:24] (step=0020200) Train Loss: 0.1589, Train Steps/Sec: 11.59, Grad Norm: 0.0280 +[2025-02-19 15:53:33] (step=0020300) Train Loss: 0.1588, Train Steps/Sec: 11.61, Grad Norm: 0.0298 +[2025-02-19 15:53:41] (step=0020400) Train Loss: 0.1603, Train Steps/Sec: 11.61, Grad Norm: 0.0283 +[2025-02-19 15:53:50] (step=0020500) Train Loss: 0.1583, Train Steps/Sec: 11.22, Grad Norm: 0.0272 +[2025-02-19 15:54:01] (step=0020600) Train Loss: 0.1588, Train Steps/Sec: 9.50, Grad Norm: 0.0280 +[2025-02-19 15:54:09] (step=0020700) Train Loss: 0.1583, Train Steps/Sec: 11.61, Grad Norm: 0.0308 +[2025-02-19 15:54:18] (step=0020800) Train Loss: 0.1572, Train Steps/Sec: 11.62, Grad Norm: 0.0285 +[2025-02-19 15:54:27] (step=0020900) Train Loss: 0.1574, Train Steps/Sec: 11.61, Grad Norm: 0.0270 +[2025-02-19 15:54:35] (step=0021000) Train Loss: 0.1587, Train Steps/Sec: 11.62, Grad Norm: 0.0288 +[2025-02-19 15:54:44] (step=0021100) Train Loss: 0.1587, Train Steps/Sec: 11.61, Grad Norm: 0.0304 +[2025-02-19 15:54:52] (step=0021200) Train Loss: 0.1573, Train Steps/Sec: 11.62, Grad Norm: 0.0292 +[2025-02-19 15:55:02] (step=0021300) Train Loss: 0.1575, Train Steps/Sec: 10.00, Grad Norm: 0.0267 +[2025-02-19 15:55:11] (step=0021400) Train Loss: 0.1573, Train Steps/Sec: 11.61, Grad Norm: 0.0270 +[2025-02-19 15:55:20] (step=0021500) Train Loss: 0.1588, Train Steps/Sec: 11.64, Grad Norm: 0.0291 +[2025-02-19 15:55:28] (step=0021600) Train Loss: 0.1583, Train Steps/Sec: 11.64, Grad Norm: 0.0267 +[2025-02-19 15:55:37] (step=0021700) Train Loss: 0.1580, Train Steps/Sec: 11.64, Grad Norm: 0.0276 +[2025-02-19 15:55:45] (step=0021800) Train Loss: 0.1580, Train Steps/Sec: 11.63, Grad Norm: 0.0263 +[2025-02-19 15:55:54] (step=0021900) Train Loss: 0.1577, Train Steps/Sec: 11.64, Grad Norm: 0.0265 +[2025-02-19 15:56:03] (step=0022000) Train Loss: 0.1585, Train Steps/Sec: 11.65, Grad Norm: 0.0259 +[2025-02-19 15:56:11] (step=0022100) Train Loss: 0.1574, Train Steps/Sec: 11.64, Grad Norm: 0.0277 +[2025-02-19 15:56:20] (step=0022200) Train Loss: 0.1578, Train Steps/Sec: 11.64, Grad Norm: 0.0263 +[2025-02-19 15:56:28] (step=0022300) Train Loss: 0.1580, Train Steps/Sec: 11.64, Grad Norm: 0.0279 +[2025-02-19 15:56:37] (step=0022400) Train Loss: 0.1579, Train Steps/Sec: 11.64, Grad Norm: 0.0257 +[2025-02-19 15:56:46] (step=0022500) Train Loss: 0.1576, Train Steps/Sec: 11.59, Grad Norm: 0.0242 +[2025-02-19 15:56:56] (step=0022600) Train Loss: 0.1576, Train Steps/Sec: 9.78, Grad Norm: 0.0260 +[2025-02-19 15:57:04] (step=0022700) Train Loss: 0.1573, Train Steps/Sec: 11.64, Grad Norm: 0.0260 +[2025-02-19 15:57:15] (step=0022800) Train Loss: 0.1581, Train Steps/Sec: 9.49, Grad Norm: 0.0262 +[2025-02-19 15:57:24] (step=0022900) Train Loss: 0.1572, Train Steps/Sec: 11.64, Grad Norm: 0.0259 +[2025-02-19 15:57:32] (step=0023000) Train Loss: 0.1573, Train Steps/Sec: 11.63, Grad Norm: 0.0263 +[2025-02-19 15:57:41] (step=0023100) Train Loss: 0.1574, Train Steps/Sec: 11.63, Grad Norm: 0.0247 +[2025-02-19 15:57:49] (step=0023200) Train Loss: 0.1575, Train Steps/Sec: 11.63, Grad Norm: 0.0246 +[2025-02-19 15:57:58] (step=0023300) Train Loss: 0.1580, Train Steps/Sec: 11.63, Grad Norm: 0.0265 +[2025-02-19 15:58:07] (step=0023400) Train Loss: 0.1569, Train Steps/Sec: 11.63, Grad Norm: 0.0249 +[2025-02-19 15:58:15] (step=0023500) Train Loss: 0.1569, Train Steps/Sec: 11.63, Grad Norm: 0.0242 +[2025-02-19 15:58:24] (step=0023600) Train Loss: 0.1568, Train Steps/Sec: 11.61, Grad Norm: 0.0252 +[2025-02-19 15:58:32] (step=0023700) Train Loss: 0.1589, Train Steps/Sec: 11.61, Grad Norm: 0.0243 +[2025-02-19 15:58:42] (step=0023800) Train Loss: 0.1577, Train Steps/Sec: 9.92, Grad Norm: 0.0243 +[2025-02-19 15:58:51] (step=0023900) Train Loss: 0.1575, Train Steps/Sec: 11.62, Grad Norm: 0.0244 +[2025-02-19 15:59:00] (step=0024000) Train Loss: 0.1573, Train Steps/Sec: 11.62, Grad Norm: 0.0254 +[2025-02-19 15:59:08] (step=0024100) Train Loss: 0.1578, Train Steps/Sec: 11.62, Grad Norm: 0.0236 +[2025-02-19 15:59:17] (step=0024200) Train Loss: 0.1569, Train Steps/Sec: 11.62, Grad Norm: 0.0246 +[2025-02-19 15:59:25] (step=0024300) Train Loss: 0.1584, Train Steps/Sec: 11.62, Grad Norm: 0.0240 +[2025-02-19 15:59:34] (step=0024400) Train Loss: 0.1565, Train Steps/Sec: 11.62, Grad Norm: 0.0235 +[2025-02-19 15:59:43] (step=0024500) Train Loss: 0.1566, Train Steps/Sec: 11.62, Grad Norm: 0.0247 +[2025-02-19 15:59:51] (step=0024600) Train Loss: 0.1580, Train Steps/Sec: 11.63, Grad Norm: 0.0239 +[2025-02-19 16:00:00] (step=0024700) Train Loss: 0.1572, Train Steps/Sec: 11.63, Grad Norm: 0.0220 +[2025-02-19 16:00:09] (step=0024800) Train Loss: 0.1568, Train Steps/Sec: 11.24, Grad Norm: 0.0223 +[2025-02-19 16:00:18] (step=0024900) Train Loss: 0.1581, Train Steps/Sec: 10.93, Grad Norm: 0.0236 +[2025-02-19 16:00:28] (step=0025000) Train Loss: 0.1574, Train Steps/Sec: 9.99, Grad Norm: 0.0227 +[2025-02-19 16:00:38] (step=0025100) Train Loss: 0.1563, Train Steps/Sec: 10.01, Grad Norm: 0.0219 +[2025-02-19 16:00:47] (step=0025200) Train Loss: 0.1568, Train Steps/Sec: 11.62, Grad Norm: 0.0226 +[2025-02-19 16:00:55] (step=0025300) Train Loss: 0.1563, Train Steps/Sec: 11.61, Grad Norm: 0.0226 +[2025-02-19 16:01:04] (step=0025400) Train Loss: 0.1568, Train Steps/Sec: 11.61, Grad Norm: 0.0235 +[2025-02-19 16:01:12] (step=0025500) Train Loss: 0.1572, Train Steps/Sec: 11.61, Grad Norm: 0.0224 +[2025-02-19 16:01:21] (step=0025600) Train Loss: 0.1575, Train Steps/Sec: 11.61, Grad Norm: 0.0225 +[2025-02-19 16:01:30] (step=0025700) Train Loss: 0.1572, Train Steps/Sec: 11.61, Grad Norm: 0.0222 +[2025-02-19 16:01:38] (step=0025800) Train Loss: 0.1567, Train Steps/Sec: 11.61, Grad Norm: 0.0223 +[2025-02-19 16:01:47] (step=0025900) Train Loss: 0.1564, Train Steps/Sec: 11.61, Grad Norm: 0.0218 +[2025-02-19 16:01:55] (step=0026000) Train Loss: 0.1575, Train Steps/Sec: 11.61, Grad Norm: 0.0214 +[2025-02-19 16:02:04] (step=0026100) Train Loss: 0.1561, Train Steps/Sec: 11.61, Grad Norm: 0.0215 +[2025-02-19 16:02:13] (step=0026200) Train Loss: 0.1580, Train Steps/Sec: 11.61, Grad Norm: 0.0215 +[2025-02-19 16:02:23] (step=0026300) Train Loss: 0.1573, Train Steps/Sec: 10.09, Grad Norm: 0.0211 +[2025-02-19 16:02:31] (step=0026400) Train Loss: 0.1581, Train Steps/Sec: 11.61, Grad Norm: 0.0220 +[2025-02-19 16:02:40] (step=0026500) Train Loss: 0.1564, Train Steps/Sec: 11.61, Grad Norm: 0.0216 +[2025-02-19 16:02:48] (step=0026600) Train Loss: 0.1577, Train Steps/Sec: 11.60, Grad Norm: 0.0209 +[2025-02-19 16:02:57] (step=0026700) Train Loss: 0.1568, Train Steps/Sec: 11.61, Grad Norm: 0.0204 +[2025-02-19 16:03:06] (step=0026800) Train Loss: 0.1575, Train Steps/Sec: 11.61, Grad Norm: 0.0210 +[2025-02-19 16:03:14] (step=0026900) Train Loss: 0.1576, Train Steps/Sec: 11.63, Grad Norm: 0.0206 +[2025-02-19 16:03:23] (step=0027000) Train Loss: 0.1560, Train Steps/Sec: 11.24, Grad Norm: 0.0216 +[2025-02-19 16:03:33] (step=0027100) Train Loss: 0.1568, Train Steps/Sec: 9.73, Grad Norm: 0.0215 +[2025-02-19 16:03:42] (step=0027200) Train Loss: 0.1565, Train Steps/Sec: 11.27, Grad Norm: 0.0216 +[2025-02-19 16:03:51] (step=0027300) Train Loss: 0.1582, Train Steps/Sec: 11.48, Grad Norm: 0.0205 +[2025-02-19 16:04:00] (step=0027400) Train Loss: 0.1571, Train Steps/Sec: 11.61, Grad Norm: 0.0192 +[2025-02-19 16:04:08] (step=0027500) Train Loss: 0.1571, Train Steps/Sec: 11.56, Grad Norm: 0.0213 +[2025-02-19 16:04:18] (step=0027600) Train Loss: 0.1575, Train Steps/Sec: 10.08, Grad Norm: 0.0200 +[2025-02-19 16:04:27] (step=0027700) Train Loss: 0.1555, Train Steps/Sec: 11.62, Grad Norm: 0.0193 +[2025-02-19 16:04:35] (step=0027800) Train Loss: 0.1590, Train Steps/Sec: 11.63, Grad Norm: 0.0201 +[2025-02-19 16:04:44] (step=0027900) Train Loss: 0.1560, Train Steps/Sec: 11.64, Grad Norm: 0.0198 +[2025-02-19 16:04:53] (step=0028000) Train Loss: 0.1566, Train Steps/Sec: 11.62, Grad Norm: 0.0199 +[2025-02-19 16:05:01] (step=0028100) Train Loss: 0.1575, Train Steps/Sec: 11.62, Grad Norm: 0.0190 +[2025-02-19 16:05:10] (step=0028200) Train Loss: 0.1561, Train Steps/Sec: 11.62, Grad Norm: 0.0201 +[2025-02-19 16:05:18] (step=0028300) Train Loss: 0.1572, Train Steps/Sec: 11.63, Grad Norm: 0.0194 +[2025-02-19 16:05:27] (step=0028400) Train Loss: 0.1566, Train Steps/Sec: 11.62, Grad Norm: 0.0200 +[2025-02-19 16:05:36] (step=0028500) Train Loss: 0.1558, Train Steps/Sec: 11.62, Grad Norm: 0.0196 +[2025-02-19 16:05:44] (step=0028600) Train Loss: 0.1557, Train Steps/Sec: 11.63, Grad Norm: 0.0201 +[2025-02-19 16:05:53] (step=0028700) Train Loss: 0.1561, Train Steps/Sec: 11.60, Grad Norm: 0.0189 +[2025-02-19 16:06:03] (step=0028800) Train Loss: 0.1566, Train Steps/Sec: 10.02, Grad Norm: 0.0193 +[2025-02-19 16:06:11] (step=0028900) Train Loss: 0.1573, Train Steps/Sec: 11.62, Grad Norm: 0.0196 +[2025-02-19 16:06:20] (step=0029000) Train Loss: 0.1570, Train Steps/Sec: 11.61, Grad Norm: 0.0191 +[2025-02-19 16:06:29] (step=0029100) Train Loss: 0.1564, Train Steps/Sec: 11.61, Grad Norm: 0.0193 +[2025-02-19 16:06:38] (step=0029200) Train Loss: 0.1563, Train Steps/Sec: 11.22, Grad Norm: 0.0189 +[2025-02-19 16:06:48] (step=0029300) Train Loss: 0.1580, Train Steps/Sec: 9.75, Grad Norm: 0.0190 +[2025-02-19 16:06:56] (step=0029400) Train Loss: 0.1548, Train Steps/Sec: 11.62, Grad Norm: 0.0190 +[2025-02-19 16:07:05] (step=0029500) Train Loss: 0.1572, Train Steps/Sec: 11.62, Grad Norm: 0.0198 +[2025-02-19 16:07:14] (step=0029600) Train Loss: 0.1561, Train Steps/Sec: 11.65, Grad Norm: 0.0181 +[2025-02-19 16:07:22] (step=0029700) Train Loss: 0.1567, Train Steps/Sec: 11.64, Grad Norm: 0.0177 +[2025-02-19 16:07:31] (step=0029800) Train Loss: 0.1564, Train Steps/Sec: 11.65, Grad Norm: 0.0181 +[2025-02-19 16:07:39] (step=0029900) Train Loss: 0.1547, Train Steps/Sec: 11.65, Grad Norm: 0.0180 +[2025-02-19 16:07:48] (step=0030000) Train Loss: 0.1565, Train Steps/Sec: 11.59, Grad Norm: 0.0186 +[2025-02-19 16:07:58] (step=0030100) Train Loss: 0.1572, Train Steps/Sec: 10.05, Grad Norm: 0.0182 +[2025-02-19 16:08:07] (step=0030200) Train Loss: 0.1561, Train Steps/Sec: 11.60, Grad Norm: 0.0175 +[2025-02-19 16:08:15] (step=0030300) Train Loss: 0.1563, Train Steps/Sec: 11.60, Grad Norm: 0.0183 +[2025-02-19 16:08:24] (step=0030400) Train Loss: 0.1555, Train Steps/Sec: 11.60, Grad Norm: 0.0182 +[2025-02-19 16:08:32] (step=0030500) Train Loss: 0.1578, Train Steps/Sec: 11.58, Grad Norm: 0.0174 +[2025-02-19 16:08:41] (step=0030600) Train Loss: 0.1569, Train Steps/Sec: 11.60, Grad Norm: 0.0190 +[2025-02-19 16:08:50] (step=0030700) Train Loss: 0.1566, Train Steps/Sec: 11.60, Grad Norm: 0.0176 +[2025-02-19 16:08:58] (step=0030800) Train Loss: 0.1546, Train Steps/Sec: 11.60, Grad Norm: 0.0172 +[2025-02-19 16:09:07] (step=0030900) Train Loss: 0.1560, Train Steps/Sec: 11.60, Grad Norm: 0.0175 +[2025-02-19 16:09:16] (step=0031000) Train Loss: 0.1578, Train Steps/Sec: 11.60, Grad Norm: 0.0171 +[2025-02-19 16:09:24] (step=0031100) Train Loss: 0.1564, Train Steps/Sec: 11.60, Grad Norm: 0.0177 +[2025-02-19 16:09:33] (step=0031200) Train Loss: 0.1571, Train Steps/Sec: 11.60, Grad Norm: 0.0170 +[2025-02-19 16:09:43] (step=0031300) Train Loss: 0.1542, Train Steps/Sec: 9.70, Grad Norm: 0.0177 +[2025-02-19 16:09:52] (step=0031400) Train Loss: 0.1558, Train Steps/Sec: 11.26, Grad Norm: 0.0165 +[2025-02-19 16:10:02] (step=0031500) Train Loss: 0.1565, Train Steps/Sec: 9.97, Grad Norm: 0.0169 +[2025-02-19 16:10:11] (step=0031600) Train Loss: 0.1553, Train Steps/Sec: 11.61, Grad Norm: 0.0172 +[2025-02-19 16:10:19] (step=0031700) Train Loss: 0.1559, Train Steps/Sec: 11.60, Grad Norm: 0.0163 +[2025-02-19 16:10:28] (step=0031800) Train Loss: 0.1562, Train Steps/Sec: 11.61, Grad Norm: 0.0172 +[2025-02-19 16:10:37] (step=0031900) Train Loss: 0.1558, Train Steps/Sec: 11.60, Grad Norm: 0.0168 +[2025-02-19 16:10:45] (step=0032000) Train Loss: 0.1552, Train Steps/Sec: 11.62, Grad Norm: 0.0164 +[2025-02-19 16:10:54] (step=0032100) Train Loss: 0.1561, Train Steps/Sec: 11.61, Grad Norm: 0.0162 +[2025-02-19 16:11:02] (step=0032200) Train Loss: 0.1553, Train Steps/Sec: 11.62, Grad Norm: 0.0165 +[2025-02-19 16:11:11] (step=0032300) Train Loss: 0.1560, Train Steps/Sec: 11.61, Grad Norm: 0.0169 +[2025-02-19 16:11:20] (step=0032400) Train Loss: 0.1560, Train Steps/Sec: 11.57, Grad Norm: 0.0166 +[2025-02-19 16:11:28] (step=0032500) Train Loss: 0.1566, Train Steps/Sec: 11.56, Grad Norm: 0.0160 +[2025-02-19 16:11:38] (step=0032600) Train Loss: 0.1550, Train Steps/Sec: 9.98, Grad Norm: 0.0163 +[2025-02-19 16:11:47] (step=0032700) Train Loss: 0.1563, Train Steps/Sec: 11.61, Grad Norm: 0.0165 +[2025-02-19 16:11:56] (step=0032800) Train Loss: 0.1551, Train Steps/Sec: 11.61, Grad Norm: 0.0164 +[2025-02-19 16:12:04] (step=0032900) Train Loss: 0.1569, Train Steps/Sec: 11.61, Grad Norm: 0.0154 +[2025-02-19 16:12:13] (step=0033000) Train Loss: 0.1559, Train Steps/Sec: 11.61, Grad Norm: 0.0152 +[2025-02-19 16:12:21] (step=0033100) Train Loss: 0.1557, Train Steps/Sec: 11.61, Grad Norm: 0.0165 +[2025-02-19 16:12:30] (step=0033200) Train Loss: 0.1559, Train Steps/Sec: 11.62, Grad Norm: 0.0159 +[2025-02-19 16:12:39] (step=0033300) Train Loss: 0.1567, Train Steps/Sec: 11.61, Grad Norm: 0.0159 +[2025-02-19 16:12:47] (step=0033400) Train Loss: 0.1561, Train Steps/Sec: 11.61, Grad Norm: 0.0155 +[2025-02-19 16:12:56] (step=0033500) Train Loss: 0.1553, Train Steps/Sec: 11.22, Grad Norm: 0.0159 +[2025-02-19 16:13:05] (step=0033600) Train Loss: 0.1556, Train Steps/Sec: 10.91, Grad Norm: 0.0160 +[2025-02-19 16:13:15] (step=0033700) Train Loss: 0.1568, Train Steps/Sec: 10.27, Grad Norm: 0.0150 +[2025-02-19 16:13:25] (step=0033800) Train Loss: 0.1554, Train Steps/Sec: 9.98, Grad Norm: 0.0151 +[2025-02-19 16:13:34] (step=0033900) Train Loss: 0.1544, Train Steps/Sec: 11.59, Grad Norm: 0.0152 +[2025-02-19 16:13:42] (step=0034000) Train Loss: 0.1550, Train Steps/Sec: 11.57, Grad Norm: 0.0158 +[2025-02-19 16:13:51] (step=0034100) Train Loss: 0.1558, Train Steps/Sec: 11.59, Grad Norm: 0.0150 +[2025-02-19 16:14:00] (step=0034200) Train Loss: 0.1559, Train Steps/Sec: 11.59, Grad Norm: 0.0152 +[2025-02-19 16:14:08] (step=0034300) Train Loss: 0.1570, Train Steps/Sec: 11.59, Grad Norm: 0.0152 +[2025-02-19 16:14:17] (step=0034400) Train Loss: 0.1561, Train Steps/Sec: 11.59, Grad Norm: 0.0147 +[2025-02-19 16:14:25] (step=0034500) Train Loss: 0.1548, Train Steps/Sec: 11.58, Grad Norm: 0.0150 +[2025-02-19 16:14:34] (step=0034600) Train Loss: 0.1556, Train Steps/Sec: 11.59, Grad Norm: 0.0149 +[2025-02-19 16:14:43] (step=0034700) Train Loss: 0.1565, Train Steps/Sec: 11.59, Grad Norm: 0.0146 +[2025-02-19 16:14:51] (step=0034800) Train Loss: 0.1562, Train Steps/Sec: 11.60, Grad Norm: 0.0154 +[2025-02-19 16:15:00] (step=0034900) Train Loss: 0.1552, Train Steps/Sec: 11.61, Grad Norm: 0.0145 +[2025-02-19 16:15:09] (step=0035000) Train Loss: 0.1544, Train Steps/Sec: 11.55, Grad Norm: 0.0146 +[2025-02-19 16:15:19] (step=0035100) Train Loss: 0.1546, Train Steps/Sec: 10.02, Grad Norm: 0.0152 +[2025-02-19 16:15:27] (step=0035200) Train Loss: 0.1555, Train Steps/Sec: 11.60, Grad Norm: 0.0146 +[2025-02-19 16:15:36] (step=0035300) Train Loss: 0.1558, Train Steps/Sec: 11.60, Grad Norm: 0.0142 +[2025-02-19 16:15:44] (step=0035400) Train Loss: 0.1556, Train Steps/Sec: 11.59, Grad Norm: 0.0146 +[2025-02-19 16:15:53] (step=0035500) Train Loss: 0.1564, Train Steps/Sec: 11.59, Grad Norm: 0.0147 +[2025-02-19 16:16:02] (step=0035600) Train Loss: 0.1556, Train Steps/Sec: 11.60, Grad Norm: 0.0146 +[2025-02-19 16:16:11] (step=0035700) Train Loss: 0.1554, Train Steps/Sec: 11.20, Grad Norm: 0.0144 +[2025-02-19 16:16:21] (step=0035800) Train Loss: 0.1547, Train Steps/Sec: 9.72, Grad Norm: 0.0140 +[2025-02-19 16:16:30] (step=0035900) Train Loss: 0.1557, Train Steps/Sec: 11.26, Grad Norm: 0.0150 +[2025-02-19 16:16:38] (step=0036000) Train Loss: 0.1555, Train Steps/Sec: 11.60, Grad Norm: 0.0141 +[2025-02-19 16:16:47] (step=0036100) Train Loss: 0.1566, Train Steps/Sec: 11.60, Grad Norm: 0.0143 +[2025-02-19 16:16:56] (step=0036200) Train Loss: 0.1560, Train Steps/Sec: 11.60, Grad Norm: 0.0143 +[2025-02-19 16:17:06] (step=0036300) Train Loss: 0.1554, Train Steps/Sec: 9.97, Grad Norm: 0.0138 +[2025-02-19 16:17:14] (step=0036400) Train Loss: 0.1565, Train Steps/Sec: 11.61, Grad Norm: 0.0137 +[2025-02-19 16:17:23] (step=0036500) Train Loss: 0.1561, Train Steps/Sec: 11.61, Grad Norm: 0.0142 +[2025-02-19 16:17:32] (step=0036600) Train Loss: 0.1558, Train Steps/Sec: 11.61, Grad Norm: 0.0139 +[2025-02-19 16:17:40] (step=0036700) Train Loss: 0.1551, Train Steps/Sec: 11.60, Grad Norm: 0.0139 +[2025-02-19 16:17:49] (step=0036800) Train Loss: 0.1561, Train Steps/Sec: 11.62, Grad Norm: 0.0137 +[2025-02-19 16:17:57] (step=0036900) Train Loss: 0.1553, Train Steps/Sec: 11.61, Grad Norm: 0.0140 +[2025-02-19 16:18:06] (step=0037000) Train Loss: 0.1549, Train Steps/Sec: 11.60, Grad Norm: 0.0134 +[2025-02-19 16:18:15] (step=0037100) Train Loss: 0.1562, Train Steps/Sec: 11.60, Grad Norm: 0.0135 +[2025-02-19 16:18:23] (step=0037200) Train Loss: 0.1558, Train Steps/Sec: 11.60, Grad Norm: 0.0137 +[2025-02-19 16:18:32] (step=0037300) Train Loss: 0.1533, Train Steps/Sec: 11.61, Grad Norm: 0.0133 +[2025-02-19 16:18:41] (step=0037400) Train Loss: 0.1564, Train Steps/Sec: 11.60, Grad Norm: 0.0134 +[2025-02-19 16:18:49] (step=0037500) Train Loss: 0.1567, Train Steps/Sec: 11.56, Grad Norm: 0.0139 +[2025-02-19 16:18:59] (step=0037600) Train Loss: 0.1552, Train Steps/Sec: 10.06, Grad Norm: 0.0134 +[2025-02-19 16:19:08] (step=0037700) Train Loss: 0.1552, Train Steps/Sec: 11.59, Grad Norm: 0.0133 +[2025-02-19 16:19:16] (step=0037800) Train Loss: 0.1563, Train Steps/Sec: 11.57, Grad Norm: 0.0136 +[2025-02-19 16:19:25] (step=0037900) Train Loss: 0.1546, Train Steps/Sec: 11.21, Grad Norm: 0.0141 +[2025-02-19 16:19:36] (step=0038000) Train Loss: 0.1555, Train Steps/Sec: 9.46, Grad Norm: 0.0134 +[2025-02-19 16:19:45] (step=0038100) Train Loss: 0.1547, Train Steps/Sec: 11.59, Grad Norm: 0.0135 +[2025-02-19 16:19:53] (step=0038200) Train Loss: 0.1549, Train Steps/Sec: 11.59, Grad Norm: 0.0130 +[2025-02-19 16:20:02] (step=0038300) Train Loss: 0.1552, Train Steps/Sec: 11.58, Grad Norm: 0.0131 +[2025-02-19 16:20:10] (step=0038400) Train Loss: 0.1555, Train Steps/Sec: 11.59, Grad Norm: 0.0134 +[2025-02-19 16:20:19] (step=0038500) Train Loss: 0.1543, Train Steps/Sec: 11.59, Grad Norm: 0.0133 +[2025-02-19 16:20:28] (step=0038600) Train Loss: 0.1556, Train Steps/Sec: 11.60, Grad Norm: 0.0126 +[2025-02-19 16:20:36] (step=0038700) Train Loss: 0.1547, Train Steps/Sec: 11.59, Grad Norm: 0.0129 +[2025-02-19 16:20:46] (step=0038800) Train Loss: 0.1562, Train Steps/Sec: 10.00, Grad Norm: 0.0127 +[2025-02-19 16:20:55] (step=0038900) Train Loss: 0.1557, Train Steps/Sec: 11.58, Grad Norm: 0.0131 +[2025-02-19 16:21:04] (step=0039000) Train Loss: 0.1558, Train Steps/Sec: 11.58, Grad Norm: 0.0135 +[2025-02-19 16:21:12] (step=0039100) Train Loss: 0.1552, Train Steps/Sec: 11.59, Grad Norm: 0.0132 +[2025-02-19 16:21:21] (step=0039200) Train Loss: 0.1548, Train Steps/Sec: 11.59, Grad Norm: 0.0128 +[2025-02-19 16:21:29] (step=0039300) Train Loss: 0.1550, Train Steps/Sec: 11.59, Grad Norm: 0.0133 +[2025-02-19 16:21:38] (step=0039400) Train Loss: 0.1540, Train Steps/Sec: 11.59, Grad Norm: 0.0125 +[2025-02-19 16:21:47] (step=0039500) Train Loss: 0.1555, Train Steps/Sec: 11.60, Grad Norm: 0.0125 +[2025-02-19 16:21:55] (step=0039600) Train Loss: 0.1544, Train Steps/Sec: 11.60, Grad Norm: 0.0131 +[2025-02-19 16:22:04] (step=0039700) Train Loss: 0.1553, Train Steps/Sec: 11.59, Grad Norm: 0.0129 +[2025-02-19 16:22:13] (step=0039800) Train Loss: 0.1563, Train Steps/Sec: 11.60, Grad Norm: 0.0128 +[2025-02-19 16:22:21] (step=0039900) Train Loss: 0.1542, Train Steps/Sec: 11.61, Grad Norm: 0.0124 +[2025-02-19 16:22:30] (step=0040000) Train Loss: 0.1556, Train Steps/Sec: 11.55, Grad Norm: 0.0130 +[2025-02-19 16:22:31] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0040000.pt +[2025-02-19 16:22:41] (step=0040100) Train Loss: 0.1531, Train Steps/Sec: 8.65, Grad Norm: 0.0124 +[2025-02-19 16:22:51] (step=0040200) Train Loss: 0.1549, Train Steps/Sec: 10.01, Grad Norm: 0.0126 +[2025-02-19 16:23:00] (step=0040300) Train Loss: 0.1538, Train Steps/Sec: 11.62, Grad Norm: 0.0127 +[2025-02-19 16:23:09] (step=0040400) Train Loss: 0.1542, Train Steps/Sec: 11.62, Grad Norm: 0.0125 +[2025-02-19 16:23:17] (step=0040500) Train Loss: 0.1564, Train Steps/Sec: 11.60, Grad Norm: 0.0126 +[2025-02-19 16:23:26] (step=0040600) Train Loss: 0.1571, Train Steps/Sec: 11.60, Grad Norm: 0.0122 +[2025-02-19 16:23:34] (step=0040700) Train Loss: 0.1547, Train Steps/Sec: 11.60, Grad Norm: 0.0122 +[2025-02-19 16:23:43] (step=0040800) Train Loss: 0.1560, Train Steps/Sec: 11.61, Grad Norm: 0.0125 +[2025-02-19 16:23:52] (step=0040900) Train Loss: 0.1552, Train Steps/Sec: 11.59, Grad Norm: 0.0124 +[2025-02-19 16:24:00] (step=0041000) Train Loss: 0.1533, Train Steps/Sec: 11.60, Grad Norm: 0.0128 +[2025-02-19 16:24:09] (step=0041100) Train Loss: 0.1555, Train Steps/Sec: 11.61, Grad Norm: 0.0121 +[2025-02-19 16:24:18] (step=0041200) Train Loss: 0.1555, Train Steps/Sec: 11.60, Grad Norm: 0.0119 +[2025-02-19 16:24:28] (step=0041300) Train Loss: 0.1542, Train Steps/Sec: 10.00, Grad Norm: 0.0124 +[2025-02-19 16:24:36] (step=0041400) Train Loss: 0.1547, Train Steps/Sec: 11.61, Grad Norm: 0.0125 +[2025-02-19 16:24:45] (step=0041500) Train Loss: 0.1538, Train Steps/Sec: 11.62, Grad Norm: 0.0122 +[2025-02-19 16:24:53] (step=0041600) Train Loss: 0.1564, Train Steps/Sec: 11.61, Grad Norm: 0.0121 +[2025-02-19 16:25:02] (step=0041700) Train Loss: 0.1542, Train Steps/Sec: 11.61, Grad Norm: 0.0123 +[2025-02-19 16:25:11] (step=0041800) Train Loss: 0.1565, Train Steps/Sec: 11.61, Grad Norm: 0.0122 +[2025-02-19 16:25:19] (step=0041900) Train Loss: 0.1553, Train Steps/Sec: 11.61, Grad Norm: 0.0119 +[2025-02-19 16:25:28] (step=0042000) Train Loss: 0.1545, Train Steps/Sec: 11.61, Grad Norm: 0.0120 +[2025-02-19 16:25:36] (step=0042100) Train Loss: 0.1551, Train Steps/Sec: 11.61, Grad Norm: 0.0122 +[2025-02-19 16:25:45] (step=0042200) Train Loss: 0.1541, Train Steps/Sec: 11.23, Grad Norm: 0.0121 +[2025-02-19 16:25:55] (step=0042300) Train Loss: 0.1548, Train Steps/Sec: 10.57, Grad Norm: 0.0124 +[2025-02-19 16:26:05] (step=0042400) Train Loss: 0.1541, Train Steps/Sec: 10.31, Grad Norm: 0.0119 +[2025-02-19 16:26:13] (step=0042500) Train Loss: 0.1549, Train Steps/Sec: 11.61, Grad Norm: 0.0119 +[2025-02-19 16:26:23] (step=0042600) Train Loss: 0.1551, Train Steps/Sec: 9.95, Grad Norm: 0.0119 +[2025-02-19 16:26:32] (step=0042700) Train Loss: 0.1552, Train Steps/Sec: 11.59, Grad Norm: 0.0118 +[2025-02-19 16:26:40] (step=0042800) Train Loss: 0.1550, Train Steps/Sec: 11.59, Grad Norm: 0.0120 +[2025-02-19 16:26:49] (step=0042900) Train Loss: 0.1560, Train Steps/Sec: 11.58, Grad Norm: 0.0118 +[2025-02-19 16:26:58] (step=0043000) Train Loss: 0.1553, Train Steps/Sec: 11.58, Grad Norm: 0.0121 +[2025-02-19 16:27:06] (step=0043100) Train Loss: 0.1546, Train Steps/Sec: 11.58, Grad Norm: 0.0125 +[2025-02-19 16:27:15] (step=0043200) Train Loss: 0.1558, Train Steps/Sec: 11.61, Grad Norm: 0.0115 +[2025-02-19 16:27:24] (step=0043300) Train Loss: 0.1547, Train Steps/Sec: 11.61, Grad Norm: 0.0125 +[2025-02-19 16:27:32] (step=0043400) Train Loss: 0.1551, Train Steps/Sec: 11.60, Grad Norm: 0.0119 +[2025-02-19 16:27:41] (step=0043500) Train Loss: 0.1552, Train Steps/Sec: 11.61, Grad Norm: 0.0116 +[2025-02-19 16:27:49] (step=0043600) Train Loss: 0.1551, Train Steps/Sec: 11.61, Grad Norm: 0.0116 +[2025-02-19 16:27:58] (step=0043700) Train Loss: 0.1547, Train Steps/Sec: 11.52, Grad Norm: 0.0123 +[2025-02-19 16:28:08] (step=0043800) Train Loss: 0.1548, Train Steps/Sec: 10.01, Grad Norm: 0.0117 +[2025-02-19 16:28:17] (step=0043900) Train Loss: 0.1549, Train Steps/Sec: 11.56, Grad Norm: 0.0120 +[2025-02-19 16:28:25] (step=0044000) Train Loss: 0.1540, Train Steps/Sec: 11.55, Grad Norm: 0.0117 +[2025-02-19 16:28:34] (step=0044100) Train Loss: 0.1547, Train Steps/Sec: 11.55, Grad Norm: 0.0122 +[2025-02-19 16:28:43] (step=0044200) Train Loss: 0.1540, Train Steps/Sec: 11.56, Grad Norm: 0.0116 +[2025-02-19 16:28:51] (step=0044300) Train Loss: 0.1544, Train Steps/Sec: 11.54, Grad Norm: 0.0122 +[2025-02-19 16:29:00] (step=0044400) Train Loss: 0.1559, Train Steps/Sec: 11.15, Grad Norm: 0.0116 +[2025-02-19 16:29:11] (step=0044500) Train Loss: 0.1543, Train Steps/Sec: 9.71, Grad Norm: 0.0113 +[2025-02-19 16:29:20] (step=0044600) Train Loss: 0.1533, Train Steps/Sec: 11.25, Grad Norm: 0.0115 +[2025-02-19 16:29:28] (step=0044700) Train Loss: 0.1551, Train Steps/Sec: 11.61, Grad Norm: 0.0120 +[2025-02-19 16:29:37] (step=0044800) Train Loss: 0.1555, Train Steps/Sec: 11.60, Grad Norm: 0.0118 +[2025-02-19 16:29:45] (step=0044900) Train Loss: 0.1541, Train Steps/Sec: 11.60, Grad Norm: 0.0112 +[2025-02-19 16:29:54] (step=0045000) Train Loss: 0.1543, Train Steps/Sec: 11.60, Grad Norm: 0.0122 +[2025-02-19 16:30:04] (step=0045100) Train Loss: 0.1563, Train Steps/Sec: 10.00, Grad Norm: 0.0118 +[2025-02-19 16:30:13] (step=0045200) Train Loss: 0.1549, Train Steps/Sec: 11.60, Grad Norm: 0.0114 +[2025-02-19 16:30:21] (step=0045300) Train Loss: 0.1542, Train Steps/Sec: 11.61, Grad Norm: 0.0118 +[2025-02-19 16:30:30] (step=0045400) Train Loss: 0.1542, Train Steps/Sec: 11.60, Grad Norm: 0.0115 +[2025-02-19 16:30:39] (step=0045500) Train Loss: 0.1556, Train Steps/Sec: 11.60, Grad Norm: 0.0113 +[2025-02-19 16:30:47] (step=0045600) Train Loss: 0.1554, Train Steps/Sec: 11.60, Grad Norm: 0.0114 +[2025-02-19 16:30:56] (step=0045700) Train Loss: 0.1535, Train Steps/Sec: 11.61, Grad Norm: 0.0115 +[2025-02-19 16:31:04] (step=0045800) Train Loss: 0.1552, Train Steps/Sec: 11.60, Grad Norm: 0.0119 +[2025-02-19 16:31:13] (step=0045900) Train Loss: 0.1548, Train Steps/Sec: 11.61, Grad Norm: 0.0110 +[2025-02-19 16:31:22] (step=0046000) Train Loss: 0.1551, Train Steps/Sec: 11.61, Grad Norm: 0.0112 +[2025-02-19 16:31:30] (step=0046100) Train Loss: 0.1541, Train Steps/Sec: 11.61, Grad Norm: 0.0111 +[2025-02-19 16:31:39] (step=0046200) Train Loss: 0.1539, Train Steps/Sec: 11.61, Grad Norm: 0.0115 +[2025-02-19 16:31:49] (step=0046300) Train Loss: 0.1531, Train Steps/Sec: 10.00, Grad Norm: 0.0115 +[2025-02-19 16:31:58] (step=0046400) Train Loss: 0.1541, Train Steps/Sec: 11.58, Grad Norm: 0.0117 +[2025-02-19 16:32:06] (step=0046500) Train Loss: 0.1550, Train Steps/Sec: 11.59, Grad Norm: 0.0112 +[2025-02-19 16:32:15] (step=0046600) Train Loss: 0.1545, Train Steps/Sec: 10.88, Grad Norm: 0.0113 +[2025-02-19 16:32:26] (step=0046700) Train Loss: 0.1555, Train Steps/Sec: 9.72, Grad Norm: 0.0115 +[2025-02-19 16:32:34] (step=0046800) Train Loss: 0.1541, Train Steps/Sec: 11.60, Grad Norm: 0.0113 +[2025-02-19 16:32:43] (step=0046900) Train Loss: 0.1545, Train Steps/Sec: 11.62, Grad Norm: 0.0113 +[2025-02-19 16:32:51] (step=0047000) Train Loss: 0.1537, Train Steps/Sec: 11.62, Grad Norm: 0.0116 +[2025-02-19 16:33:00] (step=0047100) Train Loss: 0.1547, Train Steps/Sec: 11.62, Grad Norm: 0.0115 +[2025-02-19 16:33:09] (step=0047200) Train Loss: 0.1545, Train Steps/Sec: 11.62, Grad Norm: 0.0114 +[2025-02-19 16:33:17] (step=0047300) Train Loss: 0.1547, Train Steps/Sec: 11.61, Grad Norm: 0.0109 +[2025-02-19 16:33:26] (step=0047400) Train Loss: 0.1539, Train Steps/Sec: 11.61, Grad Norm: 0.0111 +[2025-02-19 16:33:35] (step=0047500) Train Loss: 0.1549, Train Steps/Sec: 11.60, Grad Norm: 0.0114 +[2025-02-19 16:33:45] (step=0047600) Train Loss: 0.1543, Train Steps/Sec: 9.96, Grad Norm: 0.0113 +[2025-02-19 16:33:53] (step=0047700) Train Loss: 0.1559, Train Steps/Sec: 11.60, Grad Norm: 0.0115 +[2025-02-19 16:34:02] (step=0047800) Train Loss: 0.1540, Train Steps/Sec: 11.61, Grad Norm: 0.0112 +[2025-02-19 16:34:10] (step=0047900) Train Loss: 0.1538, Train Steps/Sec: 11.62, Grad Norm: 0.0110 +[2025-02-19 16:34:19] (step=0048000) Train Loss: 0.1544, Train Steps/Sec: 11.61, Grad Norm: 0.0117 +[2025-02-19 16:34:28] (step=0048100) Train Loss: 0.1556, Train Steps/Sec: 11.59, Grad Norm: 0.0115 +[2025-02-19 16:34:36] (step=0048200) Train Loss: 0.1552, Train Steps/Sec: 11.59, Grad Norm: 0.0112 +[2025-02-19 16:34:45] (step=0048300) Train Loss: 0.1553, Train Steps/Sec: 11.62, Grad Norm: 0.0116 +[2025-02-19 16:34:54] (step=0048400) Train Loss: 0.1551, Train Steps/Sec: 11.61, Grad Norm: 0.0115 +[2025-02-19 16:35:02] (step=0048500) Train Loss: 0.1544, Train Steps/Sec: 11.61, Grad Norm: 0.0110 +[2025-02-19 16:35:11] (step=0048600) Train Loss: 0.1544, Train Steps/Sec: 11.61, Grad Norm: 0.0112 +[2025-02-19 16:35:19] (step=0048700) Train Loss: 0.1541, Train Steps/Sec: 11.61, Grad Norm: 0.0112 +[2025-02-19 16:35:30] (step=0048800) Train Loss: 0.1548, Train Steps/Sec: 9.23, Grad Norm: 0.0116 +[2025-02-19 16:35:40] (step=0048900) Train Loss: 0.1555, Train Steps/Sec: 9.97, Grad Norm: 0.0109 +[2025-02-19 16:35:49] (step=0049000) Train Loss: 0.1549, Train Steps/Sec: 11.59, Grad Norm: 0.0112 +[2025-02-19 16:35:57] (step=0049100) Train Loss: 0.1546, Train Steps/Sec: 11.61, Grad Norm: 0.0110 +[2025-02-19 16:36:06] (step=0049200) Train Loss: 0.1542, Train Steps/Sec: 11.59, Grad Norm: 0.0109 +[2025-02-19 16:36:15] (step=0049300) Train Loss: 0.1533, Train Steps/Sec: 11.62, Grad Norm: 0.0111 +[2025-02-19 16:36:23] (step=0049400) Train Loss: 0.1541, Train Steps/Sec: 11.57, Grad Norm: 0.0112 +[2025-02-19 16:36:32] (step=0049500) Train Loss: 0.1534, Train Steps/Sec: 11.62, Grad Norm: 0.0106 +[2025-02-19 16:36:41] (step=0049600) Train Loss: 0.1550, Train Steps/Sec: 11.61, Grad Norm: 0.0109 +[2025-02-19 16:36:49] (step=0049700) Train Loss: 0.1547, Train Steps/Sec: 11.62, Grad Norm: 0.0110 +[2025-02-19 16:36:58] (step=0049800) Train Loss: 0.1533, Train Steps/Sec: 11.60, Grad Norm: 0.0109 +[2025-02-19 16:37:06] (step=0049900) Train Loss: 0.1536, Train Steps/Sec: 11.61, Grad Norm: 0.0110 +[2025-02-19 16:37:15] (step=0050000) Train Loss: 0.1542, Train Steps/Sec: 11.60, Grad Norm: 0.0111 +[2025-02-19 16:37:25] (step=0050100) Train Loss: 0.1539, Train Steps/Sec: 9.78, Grad Norm: 0.0111 +[2025-02-19 16:37:34] (step=0050200) Train Loss: 0.1549, Train Steps/Sec: 11.60, Grad Norm: 0.0109 +[2025-02-19 16:37:43] (step=0050300) Train Loss: 0.1544, Train Steps/Sec: 11.60, Grad Norm: 0.0108 +[2025-02-19 16:37:51] (step=0050400) Train Loss: 0.1542, Train Steps/Sec: 11.62, Grad Norm: 0.0111 +[2025-02-19 16:38:00] (step=0050500) Train Loss: 0.1540, Train Steps/Sec: 11.62, Grad Norm: 0.0107 +[2025-02-19 16:38:08] (step=0050600) Train Loss: 0.1538, Train Steps/Sec: 11.61, Grad Norm: 0.0110 +[2025-02-19 16:38:17] (step=0050700) Train Loss: 0.1544, Train Steps/Sec: 11.60, Grad Norm: 0.0108 +[2025-02-19 16:38:26] (step=0050800) Train Loss: 0.1543, Train Steps/Sec: 11.60, Grad Norm: 0.0109 +[2025-02-19 16:38:35] (step=0050900) Train Loss: 0.1537, Train Steps/Sec: 11.22, Grad Norm: 0.0110 +[2025-02-19 16:38:44] (step=0051000) Train Loss: 0.1537, Train Steps/Sec: 10.23, Grad Norm: 0.0109 +[2025-02-19 16:38:54] (step=0051100) Train Loss: 0.1530, Train Steps/Sec: 10.60, Grad Norm: 0.0109 +[2025-02-19 16:39:02] (step=0051200) Train Loss: 0.1535, Train Steps/Sec: 11.57, Grad Norm: 0.0109 +[2025-02-19 16:39:12] (step=0051300) Train Loss: 0.1549, Train Steps/Sec: 9.98, Grad Norm: 0.0109 +[2025-02-19 16:39:21] (step=0051400) Train Loss: 0.1548, Train Steps/Sec: 11.60, Grad Norm: 0.0113 +[2025-02-19 16:39:30] (step=0051500) Train Loss: 0.1531, Train Steps/Sec: 11.60, Grad Norm: 0.0106 +[2025-02-19 16:39:38] (step=0051600) Train Loss: 0.1542, Train Steps/Sec: 11.60, Grad Norm: 0.0106 +[2025-02-19 16:39:47] (step=0051700) Train Loss: 0.1538, Train Steps/Sec: 11.59, Grad Norm: 0.0110 +[2025-02-19 16:39:55] (step=0051800) Train Loss: 0.1535, Train Steps/Sec: 11.61, Grad Norm: 0.0110 +[2025-02-19 16:40:04] (step=0051900) Train Loss: 0.1537, Train Steps/Sec: 11.60, Grad Norm: 0.0106 +[2025-02-19 16:40:13] (step=0052000) Train Loss: 0.1543, Train Steps/Sec: 11.60, Grad Norm: 0.0110 +[2025-02-19 16:40:21] (step=0052100) Train Loss: 0.1525, Train Steps/Sec: 11.60, Grad Norm: 0.0107 +[2025-02-19 16:40:30] (step=0052200) Train Loss: 0.1536, Train Steps/Sec: 11.61, Grad Norm: 0.0105 +[2025-02-19 16:40:39] (step=0052300) Train Loss: 0.1538, Train Steps/Sec: 11.59, Grad Norm: 0.0108 +[2025-02-19 16:40:47] (step=0052400) Train Loss: 0.1554, Train Steps/Sec: 11.59, Grad Norm: 0.0107 +[2025-02-19 16:40:56] (step=0052500) Train Loss: 0.1526, Train Steps/Sec: 11.60, Grad Norm: 0.0111 +[2025-02-19 16:41:06] (step=0052600) Train Loss: 0.1527, Train Steps/Sec: 9.99, Grad Norm: 0.0108 +[2025-02-19 16:41:14] (step=0052700) Train Loss: 0.1546, Train Steps/Sec: 11.60, Grad Norm: 0.0108 +[2025-02-19 16:41:23] (step=0052800) Train Loss: 0.1530, Train Steps/Sec: 11.61, Grad Norm: 0.0108 +[2025-02-19 16:41:32] (step=0052900) Train Loss: 0.1530, Train Steps/Sec: 11.61, Grad Norm: 0.0107 +[2025-02-19 16:41:40] (step=0053000) Train Loss: 0.1554, Train Steps/Sec: 11.61, Grad Norm: 0.0112 +[2025-02-19 16:41:49] (step=0053100) Train Loss: 0.1532, Train Steps/Sec: 11.22, Grad Norm: 0.0106 +[2025-02-19 16:42:00] (step=0053200) Train Loss: 0.1544, Train Steps/Sec: 9.73, Grad Norm: 0.0106 +[2025-02-19 16:42:08] (step=0053300) Train Loss: 0.1539, Train Steps/Sec: 11.26, Grad Norm: 0.0107 +[2025-02-19 16:42:17] (step=0053400) Train Loss: 0.1544, Train Steps/Sec: 11.61, Grad Norm: 0.0112 +[2025-02-19 16:42:26] (step=0053500) Train Loss: 0.1524, Train Steps/Sec: 11.61, Grad Norm: 0.0110 +[2025-02-19 16:42:34] (step=0053600) Train Loss: 0.1539, Train Steps/Sec: 11.61, Grad Norm: 0.0104 +[2025-02-19 16:42:43] (step=0053700) Train Loss: 0.1550, Train Steps/Sec: 11.61, Grad Norm: 0.0106 +[2025-02-19 16:42:53] (step=0053800) Train Loss: 0.1523, Train Steps/Sec: 9.94, Grad Norm: 0.0105 +[2025-02-19 16:43:02] (step=0053900) Train Loss: 0.1543, Train Steps/Sec: 11.59, Grad Norm: 0.0112 +[2025-02-19 16:43:10] (step=0054000) Train Loss: 0.1549, Train Steps/Sec: 11.59, Grad Norm: 0.0106 +[2025-02-19 16:43:19] (step=0054100) Train Loss: 0.1538, Train Steps/Sec: 11.60, Grad Norm: 0.0103 +[2025-02-19 16:43:27] (step=0054200) Train Loss: 0.1535, Train Steps/Sec: 11.59, Grad Norm: 0.0107 +[2025-02-19 16:43:36] (step=0054300) Train Loss: 0.1543, Train Steps/Sec: 11.59, Grad Norm: 0.0108 +[2025-02-19 16:43:45] (step=0054400) Train Loss: 0.1551, Train Steps/Sec: 11.58, Grad Norm: 0.0111 +[2025-02-19 16:43:53] (step=0054500) Train Loss: 0.1536, Train Steps/Sec: 11.58, Grad Norm: 0.0108 +[2025-02-19 16:44:02] (step=0054600) Train Loss: 0.1555, Train Steps/Sec: 11.57, Grad Norm: 0.0107 +[2025-02-19 16:44:11] (step=0054700) Train Loss: 0.1554, Train Steps/Sec: 11.58, Grad Norm: 0.0101 +[2025-02-19 16:44:19] (step=0054800) Train Loss: 0.1538, Train Steps/Sec: 11.58, Grad Norm: 0.0106 +[2025-02-19 16:44:28] (step=0054900) Train Loss: 0.1546, Train Steps/Sec: 11.59, Grad Norm: 0.0112 +[2025-02-19 16:44:37] (step=0055000) Train Loss: 0.1545, Train Steps/Sec: 11.58, Grad Norm: 0.0108 +[2025-02-19 16:44:47] (step=0055100) Train Loss: 0.1537, Train Steps/Sec: 10.03, Grad Norm: 0.0109 +[2025-02-19 16:44:55] (step=0055200) Train Loss: 0.1527, Train Steps/Sec: 11.59, Grad Norm: 0.0106 +[2025-02-19 16:45:04] (step=0055300) Train Loss: 0.1528, Train Steps/Sec: 10.89, Grad Norm: 0.0108 +[2025-02-19 16:45:15] (step=0055400) Train Loss: 0.1555, Train Steps/Sec: 9.72, Grad Norm: 0.0105 +[2025-02-19 16:45:23] (step=0055500) Train Loss: 0.1535, Train Steps/Sec: 11.60, Grad Norm: 0.0105 +[2025-02-19 16:45:32] (step=0055600) Train Loss: 0.1541, Train Steps/Sec: 11.62, Grad Norm: 0.0109 +[2025-02-19 16:45:40] (step=0055700) Train Loss: 0.1547, Train Steps/Sec: 11.61, Grad Norm: 0.0105 +[2025-02-19 16:45:49] (step=0055800) Train Loss: 0.1556, Train Steps/Sec: 11.61, Grad Norm: 0.0102 +[2025-02-19 16:45:58] (step=0055900) Train Loss: 0.1536, Train Steps/Sec: 11.61, Grad Norm: 0.0108 +[2025-02-19 16:46:06] (step=0056000) Train Loss: 0.1549, Train Steps/Sec: 11.61, Grad Norm: 0.0106 +[2025-02-19 16:46:15] (step=0056100) Train Loss: 0.1529, Train Steps/Sec: 11.62, Grad Norm: 0.0100 +[2025-02-19 16:46:24] (step=0056200) Train Loss: 0.1550, Train Steps/Sec: 11.62, Grad Norm: 0.0105 +[2025-02-19 16:46:33] (step=0056300) Train Loss: 0.1537, Train Steps/Sec: 10.03, Grad Norm: 0.0105 +[2025-02-19 16:46:42] (step=0056400) Train Loss: 0.1527, Train Steps/Sec: 11.63, Grad Norm: 0.0105 +[2025-02-19 16:46:51] (step=0056500) Train Loss: 0.1533, Train Steps/Sec: 11.63, Grad Norm: 0.0110 +[2025-02-19 16:46:59] (step=0056600) Train Loss: 0.1535, Train Steps/Sec: 11.62, Grad Norm: 0.0106 +[2025-02-19 16:47:08] (step=0056700) Train Loss: 0.1541, Train Steps/Sec: 11.62, Grad Norm: 0.0103 +[2025-02-19 16:47:17] (step=0056800) Train Loss: 0.1516, Train Steps/Sec: 11.62, Grad Norm: 0.0105 +[2025-02-19 16:47:25] (step=0056900) Train Loss: 0.1531, Train Steps/Sec: 11.62, Grad Norm: 0.0106 +[2025-02-19 16:47:34] (step=0057000) Train Loss: 0.1534, Train Steps/Sec: 11.61, Grad Norm: 0.0105 +[2025-02-19 16:47:42] (step=0057100) Train Loss: 0.1536, Train Steps/Sec: 11.62, Grad Norm: 0.0105 +[2025-02-19 16:47:51] (step=0057200) Train Loss: 0.1550, Train Steps/Sec: 11.61, Grad Norm: 0.0105 +[2025-02-19 16:48:00] (step=0057300) Train Loss: 0.1542, Train Steps/Sec: 11.63, Grad Norm: 0.0105 +[2025-02-19 16:48:08] (step=0057400) Train Loss: 0.1533, Train Steps/Sec: 11.62, Grad Norm: 0.0102 +[2025-02-19 16:48:17] (step=0057500) Train Loss: 0.1550, Train Steps/Sec: 10.88, Grad Norm: 0.0106 +[2025-02-19 16:48:29] (step=0057600) Train Loss: 0.1534, Train Steps/Sec: 8.59, Grad Norm: 0.0101 +[2025-02-19 16:48:38] (step=0057700) Train Loss: 0.1542, Train Steps/Sec: 11.61, Grad Norm: 0.0103 +[2025-02-19 16:48:46] (step=0057800) Train Loss: 0.1530, Train Steps/Sec: 11.61, Grad Norm: 0.0105 +[2025-02-19 16:48:55] (step=0057900) Train Loss: 0.1558, Train Steps/Sec: 11.61, Grad Norm: 0.0104 +[2025-02-19 16:49:03] (step=0058000) Train Loss: 0.1529, Train Steps/Sec: 11.59, Grad Norm: 0.0105 +[2025-02-19 16:49:12] (step=0058100) Train Loss: 0.1527, Train Steps/Sec: 11.59, Grad Norm: 0.0102 +[2025-02-19 16:49:21] (step=0058200) Train Loss: 0.1532, Train Steps/Sec: 11.61, Grad Norm: 0.0101 +[2025-02-19 16:49:29] (step=0058300) Train Loss: 0.1544, Train Steps/Sec: 11.60, Grad Norm: 0.0106 +[2025-02-19 16:49:38] (step=0058400) Train Loss: 0.1542, Train Steps/Sec: 11.60, Grad Norm: 0.0104 +[2025-02-19 16:49:47] (step=0058500) Train Loss: 0.1536, Train Steps/Sec: 11.61, Grad Norm: 0.0104 +[2025-02-19 16:49:55] (step=0058600) Train Loss: 0.1541, Train Steps/Sec: 11.61, Grad Norm: 0.0104 +[2025-02-19 16:50:04] (step=0058700) Train Loss: 0.1543, Train Steps/Sec: 11.60, Grad Norm: 0.0106 +[2025-02-19 16:50:14] (step=0058800) Train Loss: 0.1546, Train Steps/Sec: 9.99, Grad Norm: 0.0105 +[2025-02-19 16:50:22] (step=0058900) Train Loss: 0.1533, Train Steps/Sec: 11.61, Grad Norm: 0.0104 +[2025-02-19 16:50:31] (step=0059000) Train Loss: 0.1529, Train Steps/Sec: 11.62, Grad Norm: 0.0105 +[2025-02-19 16:50:40] (step=0059100) Train Loss: 0.1555, Train Steps/Sec: 11.62, Grad Norm: 0.0104 +[2025-02-19 16:50:48] (step=0059200) Train Loss: 0.1537, Train Steps/Sec: 11.62, Grad Norm: 0.0102 +[2025-02-19 16:50:57] (step=0059300) Train Loss: 0.1536, Train Steps/Sec: 11.63, Grad Norm: 0.0103 +[2025-02-19 16:51:05] (step=0059400) Train Loss: 0.1545, Train Steps/Sec: 11.62, Grad Norm: 0.0101 +[2025-02-19 16:51:14] (step=0059500) Train Loss: 0.1532, Train Steps/Sec: 11.62, Grad Norm: 0.0104 +[2025-02-19 16:51:23] (step=0059600) Train Loss: 0.1533, Train Steps/Sec: 11.62, Grad Norm: 0.0104 +[2025-02-19 16:51:33] (step=0059700) Train Loss: 0.1525, Train Steps/Sec: 9.96, Grad Norm: 0.0103 +[2025-02-19 16:51:42] (step=0059800) Train Loss: 0.1546, Train Steps/Sec: 10.64, Grad Norm: 0.0104 +[2025-02-19 16:51:51] (step=0059900) Train Loss: 0.1536, Train Steps/Sec: 11.63, Grad Norm: 0.0101 +[2025-02-19 16:51:59] (step=0060000) Train Loss: 0.1536, Train Steps/Sec: 11.63, Grad Norm: 0.0103 +[2025-02-19 16:52:00] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0060000.pt +[2025-02-19 16:52:10] (step=0060100) Train Loss: 0.1527, Train Steps/Sec: 9.02, Grad Norm: 0.0102 +[2025-02-19 16:52:19] (step=0060200) Train Loss: 0.1529, Train Steps/Sec: 11.62, Grad Norm: 0.0106 +[2025-02-19 16:52:28] (step=0060300) Train Loss: 0.1541, Train Steps/Sec: 11.62, Grad Norm: 0.0103 +[2025-02-19 16:52:36] (step=0060400) Train Loss: 0.1539, Train Steps/Sec: 11.62, Grad Norm: 0.0104 +[2025-02-19 16:52:45] (step=0060500) Train Loss: 0.1524, Train Steps/Sec: 11.62, Grad Norm: 0.0103 +[2025-02-19 16:52:53] (step=0060600) Train Loss: 0.1532, Train Steps/Sec: 11.62, Grad Norm: 0.0100 +[2025-02-19 16:53:02] (step=0060700) Train Loss: 0.1536, Train Steps/Sec: 11.62, Grad Norm: 0.0103 +[2025-02-19 16:53:11] (step=0060800) Train Loss: 0.1540, Train Steps/Sec: 11.62, Grad Norm: 0.0103 +[2025-02-19 16:53:19] (step=0060900) Train Loss: 0.1535, Train Steps/Sec: 11.62, Grad Norm: 0.0104 +[2025-02-19 16:53:28] (step=0061000) Train Loss: 0.1532, Train Steps/Sec: 11.62, Grad Norm: 0.0106 +[2025-02-19 16:53:36] (step=0061100) Train Loss: 0.1549, Train Steps/Sec: 11.63, Grad Norm: 0.0102 +[2025-02-19 16:53:45] (step=0061200) Train Loss: 0.1534, Train Steps/Sec: 11.62, Grad Norm: 0.0099 +[2025-02-19 16:53:55] (step=0061300) Train Loss: 0.1537, Train Steps/Sec: 9.98, Grad Norm: 0.0100 +[2025-02-19 16:54:04] (step=0061400) Train Loss: 0.1534, Train Steps/Sec: 11.62, Grad Norm: 0.0101 +[2025-02-19 16:54:12] (step=0061500) Train Loss: 0.1535, Train Steps/Sec: 11.61, Grad Norm: 0.0100 +[2025-02-19 16:54:21] (step=0061600) Train Loss: 0.1536, Train Steps/Sec: 11.62, Grad Norm: 0.0101 +[2025-02-19 16:54:30] (step=0061700) Train Loss: 0.1528, Train Steps/Sec: 11.63, Grad Norm: 0.0103 +[2025-02-19 16:54:38] (step=0061800) Train Loss: 0.1534, Train Steps/Sec: 11.25, Grad Norm: 0.0104 +[2025-02-19 16:54:48] (step=0061900) Train Loss: 0.1524, Train Steps/Sec: 10.00, Grad Norm: 0.0101 +[2025-02-19 16:54:58] (step=0062000) Train Loss: 0.1526, Train Steps/Sec: 10.97, Grad Norm: 0.0100 +[2025-02-19 16:55:06] (step=0062100) Train Loss: 0.1533, Train Steps/Sec: 11.62, Grad Norm: 0.0102 +[2025-02-19 16:55:15] (step=0062200) Train Loss: 0.1545, Train Steps/Sec: 11.63, Grad Norm: 0.0100 +[2025-02-19 16:55:23] (step=0062300) Train Loss: 0.1532, Train Steps/Sec: 11.62, Grad Norm: 0.0106 +[2025-02-19 16:55:32] (step=0062400) Train Loss: 0.1536, Train Steps/Sec: 11.61, Grad Norm: 0.0101 +[2025-02-19 16:55:41] (step=0062500) Train Loss: 0.1543, Train Steps/Sec: 11.61, Grad Norm: 0.0103 +[2025-02-19 16:55:51] (step=0062600) Train Loss: 0.1537, Train Steps/Sec: 9.88, Grad Norm: 0.0105 +[2025-02-19 16:55:59] (step=0062700) Train Loss: 0.1551, Train Steps/Sec: 11.61, Grad Norm: 0.0102 +[2025-02-19 16:56:08] (step=0062800) Train Loss: 0.1547, Train Steps/Sec: 11.60, Grad Norm: 0.0101 +[2025-02-19 16:56:17] (step=0062900) Train Loss: 0.1543, Train Steps/Sec: 11.60, Grad Norm: 0.0102 +[2025-02-19 16:56:25] (step=0063000) Train Loss: 0.1528, Train Steps/Sec: 11.60, Grad Norm: 0.0102 +[2025-02-19 16:56:34] (step=0063100) Train Loss: 0.1525, Train Steps/Sec: 11.60, Grad Norm: 0.0101 +[2025-02-19 16:56:42] (step=0063200) Train Loss: 0.1531, Train Steps/Sec: 11.59, Grad Norm: 0.0099 +[2025-02-19 16:56:51] (step=0063300) Train Loss: 0.1536, Train Steps/Sec: 11.60, Grad Norm: 0.0103 +[2025-02-19 16:57:00] (step=0063400) Train Loss: 0.1536, Train Steps/Sec: 11.61, Grad Norm: 0.0102 +[2025-02-19 16:57:08] (step=0063500) Train Loss: 0.1532, Train Steps/Sec: 11.59, Grad Norm: 0.0101 +[2025-02-19 16:57:17] (step=0063600) Train Loss: 0.1531, Train Steps/Sec: 11.60, Grad Norm: 0.0101 +[2025-02-19 16:57:26] (step=0063700) Train Loss: 0.1540, Train Steps/Sec: 11.59, Grad Norm: 0.0101 +[2025-02-19 16:57:34] (step=0063800) Train Loss: 0.1527, Train Steps/Sec: 11.55, Grad Norm: 0.0101 +[2025-02-19 16:57:44] (step=0063900) Train Loss: 0.1536, Train Steps/Sec: 9.98, Grad Norm: 0.0098 +[2025-02-19 16:57:53] (step=0064000) Train Loss: 0.1538, Train Steps/Sec: 10.90, Grad Norm: 0.0100 +[2025-02-19 16:58:04] (step=0064100) Train Loss: 0.1544, Train Steps/Sec: 9.74, Grad Norm: 0.0104 +[2025-02-19 16:58:12] (step=0064200) Train Loss: 0.1538, Train Steps/Sec: 11.61, Grad Norm: 0.0099 +[2025-02-19 16:58:21] (step=0064300) Train Loss: 0.1535, Train Steps/Sec: 11.63, Grad Norm: 0.0102 +[2025-02-19 16:58:29] (step=0064400) Train Loss: 0.1525, Train Steps/Sec: 11.63, Grad Norm: 0.0101 +[2025-02-19 16:58:38] (step=0064500) Train Loss: 0.1532, Train Steps/Sec: 11.61, Grad Norm: 0.0101 +[2025-02-19 16:58:47] (step=0064600) Train Loss: 0.1553, Train Steps/Sec: 11.61, Grad Norm: 0.0102 +[2025-02-19 16:58:55] (step=0064700) Train Loss: 0.1528, Train Steps/Sec: 11.61, Grad Norm: 0.0100 +[2025-02-19 16:59:04] (step=0064800) Train Loss: 0.1534, Train Steps/Sec: 11.60, Grad Norm: 0.0099 +[2025-02-19 16:59:13] (step=0064900) Train Loss: 0.1539, Train Steps/Sec: 11.61, Grad Norm: 0.0098 +[2025-02-19 16:59:21] (step=0065000) Train Loss: 0.1551, Train Steps/Sec: 11.60, Grad Norm: 0.0099 +[2025-02-19 16:59:31] (step=0065100) Train Loss: 0.1540, Train Steps/Sec: 9.90, Grad Norm: 0.0104 +[2025-02-19 16:59:40] (step=0065200) Train Loss: 0.1537, Train Steps/Sec: 11.61, Grad Norm: 0.0098 +[2025-02-19 16:59:49] (step=0065300) Train Loss: 0.1533, Train Steps/Sec: 11.60, Grad Norm: 0.0098 +[2025-02-19 16:59:57] (step=0065400) Train Loss: 0.1535, Train Steps/Sec: 11.61, Grad Norm: 0.0099 +[2025-02-19 17:00:06] (step=0065500) Train Loss: 0.1539, Train Steps/Sec: 11.62, Grad Norm: 0.0104 +[2025-02-19 17:00:14] (step=0065600) Train Loss: 0.1541, Train Steps/Sec: 11.62, Grad Norm: 0.0098 +[2025-02-19 17:00:23] (step=0065700) Train Loss: 0.1539, Train Steps/Sec: 11.62, Grad Norm: 0.0100 +[2025-02-19 17:00:32] (step=0065800) Train Loss: 0.1516, Train Steps/Sec: 11.62, Grad Norm: 0.0097 +[2025-02-19 17:00:40] (step=0065900) Train Loss: 0.1539, Train Steps/Sec: 11.61, Grad Norm: 0.0104 +[2025-02-19 17:00:49] (step=0066000) Train Loss: 0.1534, Train Steps/Sec: 11.62, Grad Norm: 0.0100 +[2025-02-19 17:00:57] (step=0066100) Train Loss: 0.1543, Train Steps/Sec: 11.62, Grad Norm: 0.0102 +[2025-02-19 17:01:07] (step=0066200) Train Loss: 0.1534, Train Steps/Sec: 10.55, Grad Norm: 0.0100 +[2025-02-19 17:01:17] (step=0066300) Train Loss: 0.1529, Train Steps/Sec: 10.01, Grad Norm: 0.0099 +[2025-02-19 17:01:27] (step=0066400) Train Loss: 0.1533, Train Steps/Sec: 9.93, Grad Norm: 0.0101 +[2025-02-19 17:01:36] (step=0066500) Train Loss: 0.1540, Train Steps/Sec: 11.60, Grad Norm: 0.0098 +[2025-02-19 17:01:44] (step=0066600) Train Loss: 0.1532, Train Steps/Sec: 11.61, Grad Norm: 0.0096 +[2025-02-19 17:01:53] (step=0066700) Train Loss: 0.1524, Train Steps/Sec: 11.56, Grad Norm: 0.0097 +[2025-02-19 17:02:01] (step=0066800) Train Loss: 0.1524, Train Steps/Sec: 11.62, Grad Norm: 0.0098 +[2025-02-19 17:02:10] (step=0066900) Train Loss: 0.1532, Train Steps/Sec: 11.61, Grad Norm: 0.0099 +[2025-02-19 17:02:19] (step=0067000) Train Loss: 0.1529, Train Steps/Sec: 11.61, Grad Norm: 0.0101 +[2025-02-19 17:02:27] (step=0067100) Train Loss: 0.1549, Train Steps/Sec: 11.61, Grad Norm: 0.0098 +[2025-02-19 17:02:36] (step=0067200) Train Loss: 0.1525, Train Steps/Sec: 11.61, Grad Norm: 0.0102 +[2025-02-19 17:02:45] (step=0067300) Train Loss: 0.1530, Train Steps/Sec: 11.61, Grad Norm: 0.0103 +[2025-02-19 17:02:53] (step=0067400) Train Loss: 0.1552, Train Steps/Sec: 11.61, Grad Norm: 0.0097 +[2025-02-19 17:03:02] (step=0067500) Train Loss: 0.1532, Train Steps/Sec: 11.61, Grad Norm: 0.0098 +[2025-02-19 17:03:12] (step=0067600) Train Loss: 0.1540, Train Steps/Sec: 9.97, Grad Norm: 0.0098 +[2025-02-19 17:03:20] (step=0067700) Train Loss: 0.1537, Train Steps/Sec: 11.59, Grad Norm: 0.0102 +[2025-02-19 17:03:29] (step=0067800) Train Loss: 0.1540, Train Steps/Sec: 11.60, Grad Norm: 0.0097 +[2025-02-19 17:03:38] (step=0067900) Train Loss: 0.1537, Train Steps/Sec: 11.60, Grad Norm: 0.0099 +[2025-02-19 17:03:46] (step=0068000) Train Loss: 0.1530, Train Steps/Sec: 11.60, Grad Norm: 0.0098 +[2025-02-19 17:03:55] (step=0068100) Train Loss: 0.1530, Train Steps/Sec: 11.60, Grad Norm: 0.0100 +[2025-02-19 17:04:04] (step=0068200) Train Loss: 0.1532, Train Steps/Sec: 11.61, Grad Norm: 0.0100 +[2025-02-19 17:04:12] (step=0068300) Train Loss: 0.1531, Train Steps/Sec: 11.21, Grad Norm: 0.0098 +[2025-02-19 17:04:22] (step=0068400) Train Loss: 0.1544, Train Steps/Sec: 10.01, Grad Norm: 0.0097 +[2025-02-19 17:04:32] (step=0068500) Train Loss: 0.1534, Train Steps/Sec: 10.91, Grad Norm: 0.0097 +[2025-02-19 17:04:40] (step=0068600) Train Loss: 0.1541, Train Steps/Sec: 11.59, Grad Norm: 0.0096 +[2025-02-19 17:04:49] (step=0068700) Train Loss: 0.1538, Train Steps/Sec: 11.60, Grad Norm: 0.0097 +[2025-02-19 17:04:57] (step=0068800) Train Loss: 0.1543, Train Steps/Sec: 11.56, Grad Norm: 0.0094 +[2025-02-19 17:05:08] (step=0068900) Train Loss: 0.1522, Train Steps/Sec: 9.96, Grad Norm: 0.0096 +[2025-02-19 17:05:16] (step=0069000) Train Loss: 0.1537, Train Steps/Sec: 11.65, Grad Norm: 0.0100 +[2025-02-19 17:05:25] (step=0069100) Train Loss: 0.1542, Train Steps/Sec: 11.63, Grad Norm: 0.0098 +[2025-02-19 17:05:33] (step=0069200) Train Loss: 0.1522, Train Steps/Sec: 11.64, Grad Norm: 0.0098 +[2025-02-19 17:05:42] (step=0069300) Train Loss: 0.1526, Train Steps/Sec: 11.64, Grad Norm: 0.0096 +[2025-02-19 17:05:50] (step=0069400) Train Loss: 0.1534, Train Steps/Sec: 11.65, Grad Norm: 0.0101 +[2025-02-19 17:05:59] (step=0069500) Train Loss: 0.1537, Train Steps/Sec: 11.65, Grad Norm: 0.0099 +[2025-02-19 17:06:08] (step=0069600) Train Loss: 0.1531, Train Steps/Sec: 11.65, Grad Norm: 0.0098 +[2025-02-19 17:06:16] (step=0069700) Train Loss: 0.1529, Train Steps/Sec: 11.65, Grad Norm: 0.0095 +[2025-02-19 17:06:25] (step=0069800) Train Loss: 0.1550, Train Steps/Sec: 11.66, Grad Norm: 0.0096 +[2025-02-19 17:06:33] (step=0069900) Train Loss: 0.1532, Train Steps/Sec: 11.66, Grad Norm: 0.0095 +[2025-02-19 17:06:42] (step=0070000) Train Loss: 0.1549, Train Steps/Sec: 11.64, Grad Norm: 0.0100 +[2025-02-19 17:06:52] (step=0070100) Train Loss: 0.1534, Train Steps/Sec: 9.91, Grad Norm: 0.0095 +[2025-02-19 17:07:01] (step=0070200) Train Loss: 0.1540, Train Steps/Sec: 11.61, Grad Norm: 0.0097 +[2025-02-19 17:07:09] (step=0070300) Train Loss: 0.1518, Train Steps/Sec: 11.61, Grad Norm: 0.0097 +[2025-02-19 17:07:18] (step=0070400) Train Loss: 0.1534, Train Steps/Sec: 11.63, Grad Norm: 0.0096 +[2025-02-19 17:07:27] (step=0070500) Train Loss: 0.1529, Train Steps/Sec: 11.24, Grad Norm: 0.0098 +[2025-02-19 17:07:37] (step=0070600) Train Loss: 0.1516, Train Steps/Sec: 10.01, Grad Norm: 0.0096 +[2025-02-19 17:07:46] (step=0070700) Train Loss: 0.1530, Train Steps/Sec: 10.94, Grad Norm: 0.0097 +[2025-02-19 17:07:55] (step=0070800) Train Loss: 0.1541, Train Steps/Sec: 11.63, Grad Norm: 0.0094 +[2025-02-19 17:08:03] (step=0070900) Train Loss: 0.1534, Train Steps/Sec: 11.63, Grad Norm: 0.0095 +[2025-02-19 17:08:12] (step=0071000) Train Loss: 0.1537, Train Steps/Sec: 11.62, Grad Norm: 0.0096 +[2025-02-19 17:08:20] (step=0071100) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0097 +[2025-02-19 17:08:29] (step=0071200) Train Loss: 0.1545, Train Steps/Sec: 11.62, Grad Norm: 0.0099 +[2025-02-19 17:08:38] (step=0071300) Train Loss: 0.1533, Train Steps/Sec: 11.58, Grad Norm: 0.0095 +[2025-02-19 17:08:48] (step=0071400) Train Loss: 0.1537, Train Steps/Sec: 9.98, Grad Norm: 0.0096 +[2025-02-19 17:08:56] (step=0071500) Train Loss: 0.1549, Train Steps/Sec: 11.63, Grad Norm: 0.0102 +[2025-02-19 17:09:05] (step=0071600) Train Loss: 0.1527, Train Steps/Sec: 11.63, Grad Norm: 0.0099 +[2025-02-19 17:09:13] (step=0071700) Train Loss: 0.1526, Train Steps/Sec: 11.63, Grad Norm: 0.0095 +[2025-02-19 17:09:22] (step=0071800) Train Loss: 0.1534, Train Steps/Sec: 11.63, Grad Norm: 0.0098 +[2025-02-19 17:09:31] (step=0071900) Train Loss: 0.1529, Train Steps/Sec: 11.63, Grad Norm: 0.0094 +[2025-02-19 17:09:39] (step=0072000) Train Loss: 0.1525, Train Steps/Sec: 11.61, Grad Norm: 0.0096 +[2025-02-19 17:09:48] (step=0072100) Train Loss: 0.1532, Train Steps/Sec: 11.60, Grad Norm: 0.0095 +[2025-02-19 17:09:56] (step=0072200) Train Loss: 0.1525, Train Steps/Sec: 11.60, Grad Norm: 0.0096 +[2025-02-19 17:10:05] (step=0072300) Train Loss: 0.1532, Train Steps/Sec: 11.60, Grad Norm: 0.0094 +[2025-02-19 17:10:14] (step=0072400) Train Loss: 0.1538, Train Steps/Sec: 11.60, Grad Norm: 0.0094 +[2025-02-19 17:10:22] (step=0072500) Train Loss: 0.1535, Train Steps/Sec: 11.61, Grad Norm: 0.0096 +[2025-02-19 17:10:32] (step=0072600) Train Loss: 0.1541, Train Steps/Sec: 9.92, Grad Norm: 0.0095 +[2025-02-19 17:10:42] (step=0072700) Train Loss: 0.1530, Train Steps/Sec: 10.22, Grad Norm: 0.0096 +[2025-02-19 17:10:52] (step=0072800) Train Loss: 0.1545, Train Steps/Sec: 10.33, Grad Norm: 0.0097 +[2025-02-19 17:11:01] (step=0072900) Train Loss: 0.1520, Train Steps/Sec: 11.61, Grad Norm: 0.0099 +[2025-02-19 17:11:09] (step=0073000) Train Loss: 0.1523, Train Steps/Sec: 11.60, Grad Norm: 0.0098 +[2025-02-19 17:11:18] (step=0073100) Train Loss: 0.1533, Train Steps/Sec: 11.59, Grad Norm: 0.0093 +[2025-02-19 17:11:26] (step=0073200) Train Loss: 0.1540, Train Steps/Sec: 11.60, Grad Norm: 0.0094 +[2025-02-19 17:11:35] (step=0073300) Train Loss: 0.1533, Train Steps/Sec: 11.60, Grad Norm: 0.0096 +[2025-02-19 17:11:44] (step=0073400) Train Loss: 0.1527, Train Steps/Sec: 11.60, Grad Norm: 0.0098 +[2025-02-19 17:11:52] (step=0073500) Train Loss: 0.1541, Train Steps/Sec: 11.61, Grad Norm: 0.0093 +[2025-02-19 17:12:01] (step=0073600) Train Loss: 0.1540, Train Steps/Sec: 11.59, Grad Norm: 0.0095 +[2025-02-19 17:12:10] (step=0073700) Train Loss: 0.1525, Train Steps/Sec: 11.58, Grad Norm: 0.0095 +[2025-02-19 17:12:18] (step=0073800) Train Loss: 0.1530, Train Steps/Sec: 11.55, Grad Norm: 0.0096 +[2025-02-19 17:12:28] (step=0073900) Train Loss: 0.1535, Train Steps/Sec: 10.01, Grad Norm: 0.0094 +[2025-02-19 17:12:37] (step=0074000) Train Loss: 0.1538, Train Steps/Sec: 11.64, Grad Norm: 0.0094 +[2025-02-19 17:12:45] (step=0074100) Train Loss: 0.1534, Train Steps/Sec: 11.64, Grad Norm: 0.0096 +[2025-02-19 17:12:54] (step=0074200) Train Loss: 0.1536, Train Steps/Sec: 11.63, Grad Norm: 0.0098 +[2025-02-19 17:13:03] (step=0074300) Train Loss: 0.1532, Train Steps/Sec: 11.63, Grad Norm: 0.0095 +[2025-02-19 17:13:11] (step=0074400) Train Loss: 0.1527, Train Steps/Sec: 11.62, Grad Norm: 0.0094 +[2025-02-19 17:13:20] (step=0074500) Train Loss: 0.1526, Train Steps/Sec: 11.64, Grad Norm: 0.0097 +[2025-02-19 17:13:28] (step=0074600) Train Loss: 0.1540, Train Steps/Sec: 11.63, Grad Norm: 0.0095 +[2025-02-19 17:13:37] (step=0074700) Train Loss: 0.1532, Train Steps/Sec: 11.64, Grad Norm: 0.0096 +[2025-02-19 17:13:46] (step=0074800) Train Loss: 0.1535, Train Steps/Sec: 11.63, Grad Norm: 0.0095 +[2025-02-19 17:13:55] (step=0074900) Train Loss: 0.1528, Train Steps/Sec: 10.26, Grad Norm: 0.0096 +[2025-02-19 17:14:05] (step=0075000) Train Loss: 0.1535, Train Steps/Sec: 10.36, Grad Norm: 0.0094 +[2025-02-19 17:14:15] (step=0075100) Train Loss: 0.1538, Train Steps/Sec: 9.99, Grad Norm: 0.0092 +[2025-02-19 17:14:24] (step=0075200) Train Loss: 0.1539, Train Steps/Sec: 11.63, Grad Norm: 0.0094 +[2025-02-19 17:14:32] (step=0075300) Train Loss: 0.1540, Train Steps/Sec: 11.63, Grad Norm: 0.0096 +[2025-02-19 17:14:41] (step=0075400) Train Loss: 0.1529, Train Steps/Sec: 11.63, Grad Norm: 0.0097 +[2025-02-19 17:14:49] (step=0075500) Train Loss: 0.1535, Train Steps/Sec: 11.62, Grad Norm: 0.0096 +[2025-02-19 17:14:58] (step=0075600) Train Loss: 0.1528, Train Steps/Sec: 11.61, Grad Norm: 0.0093 +[2025-02-19 17:15:07] (step=0075700) Train Loss: 0.1534, Train Steps/Sec: 11.63, Grad Norm: 0.0097 +[2025-02-19 17:15:15] (step=0075800) Train Loss: 0.1527, Train Steps/Sec: 11.63, Grad Norm: 0.0096 +[2025-02-19 17:15:24] (step=0075900) Train Loss: 0.1536, Train Steps/Sec: 11.61, Grad Norm: 0.0093 +[2025-02-19 17:15:32] (step=0076000) Train Loss: 0.1546, Train Steps/Sec: 11.62, Grad Norm: 0.0096 +[2025-02-19 17:15:41] (step=0076100) Train Loss: 0.1535, Train Steps/Sec: 11.62, Grad Norm: 0.0094 +[2025-02-19 17:15:50] (step=0076200) Train Loss: 0.1522, Train Steps/Sec: 11.63, Grad Norm: 0.0098 +[2025-02-19 17:15:58] (step=0076300) Train Loss: 0.1532, Train Steps/Sec: 11.59, Grad Norm: 0.0095 +[2025-02-19 17:16:08] (step=0076400) Train Loss: 0.1527, Train Steps/Sec: 9.94, Grad Norm: 0.0093 +[2025-02-19 17:16:17] (step=0076500) Train Loss: 0.1540, Train Steps/Sec: 11.60, Grad Norm: 0.0096 +[2025-02-19 17:16:26] (step=0076600) Train Loss: 0.1522, Train Steps/Sec: 11.61, Grad Norm: 0.0094 +[2025-02-19 17:16:34] (step=0076700) Train Loss: 0.1524, Train Steps/Sec: 11.48, Grad Norm: 0.0095 +[2025-02-19 17:16:43] (step=0076800) Train Loss: 0.1524, Train Steps/Sec: 11.45, Grad Norm: 0.0092 +[2025-02-19 17:16:52] (step=0076900) Train Loss: 0.1534, Train Steps/Sec: 11.45, Grad Norm: 0.0092 +[2025-02-19 17:17:01] (step=0077000) Train Loss: 0.1523, Train Steps/Sec: 11.08, Grad Norm: 0.0095 +[2025-02-19 17:17:11] (step=0077100) Train Loss: 0.1545, Train Steps/Sec: 9.86, Grad Norm: 0.0097 +[2025-02-19 17:17:20] (step=0077200) Train Loss: 0.1542, Train Steps/Sec: 10.81, Grad Norm: 0.0095 +[2025-02-19 17:17:29] (step=0077300) Train Loss: 0.1520, Train Steps/Sec: 11.46, Grad Norm: 0.0092 +[2025-02-19 17:17:38] (step=0077400) Train Loss: 0.1532, Train Steps/Sec: 11.46, Grad Norm: 0.0091 +[2025-02-19 17:17:46] (step=0077500) Train Loss: 0.1525, Train Steps/Sec: 11.46, Grad Norm: 0.0092 +[2025-02-19 17:17:57] (step=0077600) Train Loss: 0.1538, Train Steps/Sec: 9.82, Grad Norm: 0.0095 +[2025-02-19 17:18:05] (step=0077700) Train Loss: 0.1525, Train Steps/Sec: 11.58, Grad Norm: 0.0093 +[2025-02-19 17:18:14] (step=0077800) Train Loss: 0.1527, Train Steps/Sec: 11.59, Grad Norm: 0.0096 +[2025-02-19 17:18:22] (step=0077900) Train Loss: 0.1532, Train Steps/Sec: 11.59, Grad Norm: 0.0091 +[2025-02-19 17:18:31] (step=0078000) Train Loss: 0.1521, Train Steps/Sec: 11.59, Grad Norm: 0.0092 +[2025-02-19 17:18:40] (step=0078100) Train Loss: 0.1534, Train Steps/Sec: 11.59, Grad Norm: 0.0093 +[2025-02-19 17:18:48] (step=0078200) Train Loss: 0.1530, Train Steps/Sec: 11.60, Grad Norm: 0.0094 +[2025-02-19 17:18:57] (step=0078300) Train Loss: 0.1543, Train Steps/Sec: 11.59, Grad Norm: 0.0093 +[2025-02-19 17:19:06] (step=0078400) Train Loss: 0.1534, Train Steps/Sec: 11.59, Grad Norm: 0.0097 +[2025-02-19 17:19:14] (step=0078500) Train Loss: 0.1537, Train Steps/Sec: 11.58, Grad Norm: 0.0091 +[2025-02-19 17:19:23] (step=0078600) Train Loss: 0.1545, Train Steps/Sec: 11.60, Grad Norm: 0.0092 +[2025-02-19 17:19:31] (step=0078700) Train Loss: 0.1518, Train Steps/Sec: 11.59, Grad Norm: 0.0097 +[2025-02-19 17:19:40] (step=0078800) Train Loss: 0.1533, Train Steps/Sec: 11.55, Grad Norm: 0.0093 +[2025-02-19 17:19:50] (step=0078900) Train Loss: 0.1536, Train Steps/Sec: 9.93, Grad Norm: 0.0095 +[2025-02-19 17:19:59] (step=0079000) Train Loss: 0.1527, Train Steps/Sec: 11.59, Grad Norm: 0.0094 +[2025-02-19 17:20:07] (step=0079100) Train Loss: 0.1532, Train Steps/Sec: 11.59, Grad Norm: 0.0090 +[2025-02-19 17:20:17] (step=0079200) Train Loss: 0.1528, Train Steps/Sec: 10.85, Grad Norm: 0.0095 +[2025-02-19 17:20:26] (step=0079300) Train Loss: 0.1517, Train Steps/Sec: 10.59, Grad Norm: 0.0095 +[2025-02-19 17:20:35] (step=0079400) Train Loss: 0.1535, Train Steps/Sec: 10.92, Grad Norm: 0.0091 +[2025-02-19 17:20:44] (step=0079500) Train Loss: 0.1540, Train Steps/Sec: 11.59, Grad Norm: 0.0095 +[2025-02-19 17:20:53] (step=0079600) Train Loss: 0.1525, Train Steps/Sec: 11.59, Grad Norm: 0.0094 +[2025-02-19 17:21:01] (step=0079700) Train Loss: 0.1534, Train Steps/Sec: 11.61, Grad Norm: 0.0094 +[2025-02-19 17:21:10] (step=0079800) Train Loss: 0.1520, Train Steps/Sec: 11.61, Grad Norm: 0.0091 +[2025-02-19 17:21:18] (step=0079900) Train Loss: 0.1536, Train Steps/Sec: 11.62, Grad Norm: 0.0094 +[2025-02-19 17:21:27] (step=0080000) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0094 +[2025-02-19 17:21:28] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0080000.pt +[2025-02-19 17:21:38] (step=0080100) Train Loss: 0.1538, Train Steps/Sec: 8.85, Grad Norm: 0.0090 +[2025-02-19 17:21:47] (step=0080200) Train Loss: 0.1525, Train Steps/Sec: 11.60, Grad Norm: 0.0095 +[2025-02-19 17:21:56] (step=0080300) Train Loss: 0.1526, Train Steps/Sec: 11.59, Grad Norm: 0.0093 +[2025-02-19 17:22:04] (step=0080400) Train Loss: 0.1540, Train Steps/Sec: 11.59, Grad Norm: 0.0092 +[2025-02-19 17:22:13] (step=0080500) Train Loss: 0.1534, Train Steps/Sec: 11.59, Grad Norm: 0.0090 +[2025-02-19 17:22:21] (step=0080600) Train Loss: 0.1527, Train Steps/Sec: 11.59, Grad Norm: 0.0092 +[2025-02-19 17:22:30] (step=0080700) Train Loss: 0.1522, Train Steps/Sec: 11.59, Grad Norm: 0.0092 +[2025-02-19 17:22:39] (step=0080800) Train Loss: 0.1531, Train Steps/Sec: 11.60, Grad Norm: 0.0094 +[2025-02-19 17:22:47] (step=0080900) Train Loss: 0.1539, Train Steps/Sec: 11.59, Grad Norm: 0.0089 +[2025-02-19 17:22:56] (step=0081000) Train Loss: 0.1540, Train Steps/Sec: 11.58, Grad Norm: 0.0092 +[2025-02-19 17:23:05] (step=0081100) Train Loss: 0.1524, Train Steps/Sec: 11.60, Grad Norm: 0.0095 +[2025-02-19 17:23:13] (step=0081200) Train Loss: 0.1543, Train Steps/Sec: 11.60, Grad Norm: 0.0092 +[2025-02-19 17:23:22] (step=0081300) Train Loss: 0.1532, Train Steps/Sec: 11.55, Grad Norm: 0.0098 +[2025-02-19 17:23:33] (step=0081400) Train Loss: 0.1527, Train Steps/Sec: 9.23, Grad Norm: 0.0091 +[2025-02-19 17:23:42] (step=0081500) Train Loss: 0.1525, Train Steps/Sec: 10.31, Grad Norm: 0.0093 +[2025-02-19 17:23:51] (step=0081600) Train Loss: 0.1529, Train Steps/Sec: 11.58, Grad Norm: 0.0094 +[2025-02-19 17:24:00] (step=0081700) Train Loss: 0.1535, Train Steps/Sec: 11.59, Grad Norm: 0.0091 +[2025-02-19 17:24:08] (step=0081800) Train Loss: 0.1537, Train Steps/Sec: 11.58, Grad Norm: 0.0094 +[2025-02-19 17:24:17] (step=0081900) Train Loss: 0.1532, Train Steps/Sec: 11.58, Grad Norm: 0.0093 +[2025-02-19 17:24:26] (step=0082000) Train Loss: 0.1531, Train Steps/Sec: 11.59, Grad Norm: 0.0093 +[2025-02-19 17:24:34] (step=0082100) Train Loss: 0.1534, Train Steps/Sec: 11.59, Grad Norm: 0.0090 +[2025-02-19 17:24:43] (step=0082200) Train Loss: 0.1534, Train Steps/Sec: 11.59, Grad Norm: 0.0093 +[2025-02-19 17:24:51] (step=0082300) Train Loss: 0.1526, Train Steps/Sec: 11.59, Grad Norm: 0.0090 +[2025-02-19 17:25:00] (step=0082400) Train Loss: 0.1510, Train Steps/Sec: 11.57, Grad Norm: 0.0093 +[2025-02-19 17:25:09] (step=0082500) Train Loss: 0.1528, Train Steps/Sec: 11.59, Grad Norm: 0.0093 +[2025-02-19 17:25:19] (step=0082600) Train Loss: 0.1533, Train Steps/Sec: 9.79, Grad Norm: 0.0095 +[2025-02-19 17:25:28] (step=0082700) Train Loss: 0.1545, Train Steps/Sec: 11.59, Grad Norm: 0.0092 +[2025-02-19 17:25:36] (step=0082800) Train Loss: 0.1533, Train Steps/Sec: 11.59, Grad Norm: 0.0091 +[2025-02-19 17:25:45] (step=0082900) Train Loss: 0.1534, Train Steps/Sec: 11.60, Grad Norm: 0.0094 +[2025-02-19 17:25:53] (step=0083000) Train Loss: 0.1531, Train Steps/Sec: 11.60, Grad Norm: 0.0092 +[2025-02-19 17:26:02] (step=0083100) Train Loss: 0.1523, Train Steps/Sec: 11.60, Grad Norm: 0.0090 +[2025-02-19 17:26:11] (step=0083200) Train Loss: 0.1523, Train Steps/Sec: 11.60, Grad Norm: 0.0094 +[2025-02-19 17:26:19] (step=0083300) Train Loss: 0.1529, Train Steps/Sec: 11.60, Grad Norm: 0.0090 +[2025-02-19 17:26:28] (step=0083400) Train Loss: 0.1524, Train Steps/Sec: 11.60, Grad Norm: 0.0092 +[2025-02-19 17:26:37] (step=0083500) Train Loss: 0.1528, Train Steps/Sec: 11.60, Grad Norm: 0.0089 +[2025-02-19 17:26:46] (step=0083600) Train Loss: 0.1523, Train Steps/Sec: 10.50, Grad Norm: 0.0090 +[2025-02-19 17:26:56] (step=0083700) Train Loss: 0.1528, Train Steps/Sec: 10.33, Grad Norm: 0.0088 +[2025-02-19 17:27:04] (step=0083800) Train Loss: 0.1531, Train Steps/Sec: 11.55, Grad Norm: 0.0093 +[2025-02-19 17:27:14] (step=0083900) Train Loss: 0.1518, Train Steps/Sec: 10.07, Grad Norm: 0.0092 +[2025-02-19 17:27:23] (step=0084000) Train Loss: 0.1531, Train Steps/Sec: 11.62, Grad Norm: 0.0095 +[2025-02-19 17:27:32] (step=0084100) Train Loss: 0.1533, Train Steps/Sec: 11.62, Grad Norm: 0.0094 +[2025-02-19 17:27:40] (step=0084200) Train Loss: 0.1531, Train Steps/Sec: 11.61, Grad Norm: 0.0093 +[2025-02-19 17:27:49] (step=0084300) Train Loss: 0.1521, Train Steps/Sec: 11.62, Grad Norm: 0.0090 +[2025-02-19 17:27:57] (step=0084400) Train Loss: 0.1535, Train Steps/Sec: 11.62, Grad Norm: 0.0091 +[2025-02-19 17:28:06] (step=0084500) Train Loss: 0.1527, Train Steps/Sec: 11.61, Grad Norm: 0.0095 +[2025-02-19 17:28:15] (step=0084600) Train Loss: 0.1534, Train Steps/Sec: 11.62, Grad Norm: 0.0091 +[2025-02-19 17:28:23] (step=0084700) Train Loss: 0.1522, Train Steps/Sec: 11.61, Grad Norm: 0.0091 +[2025-02-19 17:28:32] (step=0084800) Train Loss: 0.1538, Train Steps/Sec: 11.61, Grad Norm: 0.0094 +[2025-02-19 17:28:40] (step=0084900) Train Loss: 0.1525, Train Steps/Sec: 11.61, Grad Norm: 0.0090 +[2025-02-19 17:28:49] (step=0085000) Train Loss: 0.1522, Train Steps/Sec: 11.62, Grad Norm: 0.0092 +[2025-02-19 17:28:59] (step=0085100) Train Loss: 0.1516, Train Steps/Sec: 9.95, Grad Norm: 0.0090 +[2025-02-19 17:29:08] (step=0085200) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0092 +[2025-02-19 17:29:16] (step=0085300) Train Loss: 0.1538, Train Steps/Sec: 11.62, Grad Norm: 0.0093 +[2025-02-19 17:29:25] (step=0085400) Train Loss: 0.1519, Train Steps/Sec: 11.61, Grad Norm: 0.0093 +[2025-02-19 17:29:34] (step=0085500) Train Loss: 0.1532, Train Steps/Sec: 11.61, Grad Norm: 0.0091 +[2025-02-19 17:29:42] (step=0085600) Train Loss: 0.1530, Train Steps/Sec: 11.60, Grad Norm: 0.0093 +[2025-02-19 17:29:51] (step=0085700) Train Loss: 0.1528, Train Steps/Sec: 11.23, Grad Norm: 0.0092 +[2025-02-19 17:30:01] (step=0085800) Train Loss: 0.1532, Train Steps/Sec: 10.29, Grad Norm: 0.0090 +[2025-02-19 17:30:10] (step=0085900) Train Loss: 0.1535, Train Steps/Sec: 10.95, Grad Norm: 0.0089 +[2025-02-19 17:30:19] (step=0086000) Train Loss: 0.1526, Train Steps/Sec: 11.62, Grad Norm: 0.0090 +[2025-02-19 17:30:27] (step=0086100) Train Loss: 0.1537, Train Steps/Sec: 11.62, Grad Norm: 0.0093 +[2025-02-19 17:30:36] (step=0086200) Train Loss: 0.1519, Train Steps/Sec: 11.62, Grad Norm: 0.0089 +[2025-02-19 17:30:44] (step=0086300) Train Loss: 0.1521, Train Steps/Sec: 11.56, Grad Norm: 0.0090 +[2025-02-19 17:30:55] (step=0086400) Train Loss: 0.1538, Train Steps/Sec: 9.88, Grad Norm: 0.0091 +[2025-02-19 17:31:03] (step=0086500) Train Loss: 0.1530, Train Steps/Sec: 11.60, Grad Norm: 0.0092 +[2025-02-19 17:31:12] (step=0086600) Train Loss: 0.1529, Train Steps/Sec: 11.61, Grad Norm: 0.0091 +[2025-02-19 17:31:20] (step=0086700) Train Loss: 0.1538, Train Steps/Sec: 11.62, Grad Norm: 0.0089 +[2025-02-19 17:31:29] (step=0086800) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0090 +[2025-02-19 17:31:38] (step=0086900) Train Loss: 0.1527, Train Steps/Sec: 11.62, Grad Norm: 0.0091 +[2025-02-19 17:31:46] (step=0087000) Train Loss: 0.1536, Train Steps/Sec: 11.62, Grad Norm: 0.0092 +[2025-02-19 17:31:55] (step=0087100) Train Loss: 0.1547, Train Steps/Sec: 11.62, Grad Norm: 0.0091 +[2025-02-19 17:32:03] (step=0087200) Train Loss: 0.1536, Train Steps/Sec: 11.61, Grad Norm: 0.0089 +[2025-02-19 17:32:12] (step=0087300) Train Loss: 0.1535, Train Steps/Sec: 11.60, Grad Norm: 0.0089 +[2025-02-19 17:32:21] (step=0087400) Train Loss: 0.1533, Train Steps/Sec: 11.61, Grad Norm: 0.0090 +[2025-02-19 17:32:29] (step=0087500) Train Loss: 0.1527, Train Steps/Sec: 11.62, Grad Norm: 0.0094 +[2025-02-19 17:32:39] (step=0087600) Train Loss: 0.1532, Train Steps/Sec: 9.99, Grad Norm: 0.0094 +[2025-02-19 17:32:48] (step=0087700) Train Loss: 0.1533, Train Steps/Sec: 11.59, Grad Norm: 0.0094 +[2025-02-19 17:32:56] (step=0087800) Train Loss: 0.1527, Train Steps/Sec: 11.59, Grad Norm: 0.0088 +[2025-02-19 17:33:06] (step=0087900) Train Loss: 0.1520, Train Steps/Sec: 10.83, Grad Norm: 0.0087 +[2025-02-19 17:33:15] (step=0088000) Train Loss: 0.1535, Train Steps/Sec: 10.58, Grad Norm: 0.0088 +[2025-02-19 17:33:24] (step=0088100) Train Loss: 0.1529, Train Steps/Sec: 10.92, Grad Norm: 0.0091 +[2025-02-19 17:33:33] (step=0088200) Train Loss: 0.1534, Train Steps/Sec: 11.60, Grad Norm: 0.0091 +[2025-02-19 17:33:42] (step=0088300) Train Loss: 0.1527, Train Steps/Sec: 11.58, Grad Norm: 0.0089 +[2025-02-19 17:33:50] (step=0088400) Train Loss: 0.1528, Train Steps/Sec: 11.60, Grad Norm: 0.0089 +[2025-02-19 17:33:59] (step=0088500) Train Loss: 0.1531, Train Steps/Sec: 11.60, Grad Norm: 0.0088 +[2025-02-19 17:34:07] (step=0088600) Train Loss: 0.1524, Train Steps/Sec: 11.62, Grad Norm: 0.0088 +[2025-02-19 17:34:16] (step=0088700) Train Loss: 0.1524, Train Steps/Sec: 11.61, Grad Norm: 0.0090 +[2025-02-19 17:34:25] (step=0088800) Train Loss: 0.1534, Train Steps/Sec: 11.56, Grad Norm: 0.0089 +[2025-02-19 17:34:35] (step=0088900) Train Loss: 0.1529, Train Steps/Sec: 10.00, Grad Norm: 0.0090 +[2025-02-19 17:34:43] (step=0089000) Train Loss: 0.1513, Train Steps/Sec: 11.63, Grad Norm: 0.0089 +[2025-02-19 17:34:52] (step=0089100) Train Loss: 0.1542, Train Steps/Sec: 11.62, Grad Norm: 0.0092 +[2025-02-19 17:35:01] (step=0089200) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0088 +[2025-02-19 17:35:09] (step=0089300) Train Loss: 0.1532, Train Steps/Sec: 11.61, Grad Norm: 0.0094 +[2025-02-19 17:35:18] (step=0089400) Train Loss: 0.1527, Train Steps/Sec: 11.61, Grad Norm: 0.0090 +[2025-02-19 17:35:26] (step=0089500) Train Loss: 0.1536, Train Steps/Sec: 11.62, Grad Norm: 0.0091 +[2025-02-19 17:35:35] (step=0089600) Train Loss: 0.1551, Train Steps/Sec: 11.59, Grad Norm: 0.0090 +[2025-02-19 17:35:44] (step=0089700) Train Loss: 0.1530, Train Steps/Sec: 11.60, Grad Norm: 0.0092 +[2025-02-19 17:35:52] (step=0089800) Train Loss: 0.1522, Train Steps/Sec: 11.60, Grad Norm: 0.0089 +[2025-02-19 17:36:01] (step=0089900) Train Loss: 0.1538, Train Steps/Sec: 11.61, Grad Norm: 0.0094 +[2025-02-19 17:36:10] (step=0090000) Train Loss: 0.1528, Train Steps/Sec: 11.60, Grad Norm: 0.0090 +[2025-02-19 17:36:20] (step=0090100) Train Loss: 0.1532, Train Steps/Sec: 9.15, Grad Norm: 0.0087 +[2025-02-19 17:36:30] (step=0090200) Train Loss: 0.1525, Train Steps/Sec: 10.30, Grad Norm: 0.0088 +[2025-02-19 17:36:39] (step=0090300) Train Loss: 0.1533, Train Steps/Sec: 11.60, Grad Norm: 0.0091 +[2025-02-19 17:36:47] (step=0090400) Train Loss: 0.1522, Train Steps/Sec: 11.60, Grad Norm: 0.0088 +[2025-02-19 17:36:56] (step=0090500) Train Loss: 0.1528, Train Steps/Sec: 11.60, Grad Norm: 0.0088 +[2025-02-19 17:37:05] (step=0090600) Train Loss: 0.1526, Train Steps/Sec: 11.61, Grad Norm: 0.0089 +[2025-02-19 17:37:13] (step=0090700) Train Loss: 0.1533, Train Steps/Sec: 11.61, Grad Norm: 0.0089 +[2025-02-19 17:37:22] (step=0090800) Train Loss: 0.1529, Train Steps/Sec: 11.61, Grad Norm: 0.0091 +[2025-02-19 17:37:30] (step=0090900) Train Loss: 0.1515, Train Steps/Sec: 11.60, Grad Norm: 0.0093 +[2025-02-19 17:37:39] (step=0091000) Train Loss: 0.1533, Train Steps/Sec: 11.61, Grad Norm: 0.0088 +[2025-02-19 17:37:48] (step=0091100) Train Loss: 0.1536, Train Steps/Sec: 11.61, Grad Norm: 0.0087 +[2025-02-19 17:37:56] (step=0091200) Train Loss: 0.1523, Train Steps/Sec: 11.61, Grad Norm: 0.0089 +[2025-02-19 17:38:05] (step=0091300) Train Loss: 0.1539, Train Steps/Sec: 11.56, Grad Norm: 0.0092 +[2025-02-19 17:38:15] (step=0091400) Train Loss: 0.1547, Train Steps/Sec: 9.88, Grad Norm: 0.0089 +[2025-02-19 17:38:24] (step=0091500) Train Loss: 0.1533, Train Steps/Sec: 11.64, Grad Norm: 0.0088 +[2025-02-19 17:38:32] (step=0091600) Train Loss: 0.1528, Train Steps/Sec: 11.62, Grad Norm: 0.0088 +[2025-02-19 17:38:41] (step=0091700) Train Loss: 0.1532, Train Steps/Sec: 11.62, Grad Norm: 0.0092 +[2025-02-19 17:38:50] (step=0091800) Train Loss: 0.1525, Train Steps/Sec: 11.62, Grad Norm: 0.0089 +[2025-02-19 17:38:58] (step=0091900) Train Loss: 0.1525, Train Steps/Sec: 11.62, Grad Norm: 0.0087 +[2025-02-19 17:39:07] (step=0092000) Train Loss: 0.1522, Train Steps/Sec: 11.61, Grad Norm: 0.0089 +[2025-02-19 17:39:15] (step=0092100) Train Loss: 0.1541, Train Steps/Sec: 11.64, Grad Norm: 0.0088 +[2025-02-19 17:39:24] (step=0092200) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0088 +[2025-02-19 17:39:34] (step=0092300) Train Loss: 0.1528, Train Steps/Sec: 9.97, Grad Norm: 0.0089 +[2025-02-19 17:39:43] (step=0092400) Train Loss: 0.1534, Train Steps/Sec: 10.62, Grad Norm: 0.0088 +[2025-02-19 17:39:52] (step=0092500) Train Loss: 0.1527, Train Steps/Sec: 11.60, Grad Norm: 0.0090 +[2025-02-19 17:40:02] (step=0092600) Train Loss: 0.1520, Train Steps/Sec: 9.88, Grad Norm: 0.0088 +[2025-02-19 17:40:11] (step=0092700) Train Loss: 0.1524, Train Steps/Sec: 11.45, Grad Norm: 0.0089 +[2025-02-19 17:40:20] (step=0092800) Train Loss: 0.1526, Train Steps/Sec: 11.44, Grad Norm: 0.0091 +[2025-02-19 17:40:28] (step=0092900) Train Loss: 0.1545, Train Steps/Sec: 11.46, Grad Norm: 0.0088 +[2025-02-19 17:40:37] (step=0093000) Train Loss: 0.1529, Train Steps/Sec: 11.46, Grad Norm: 0.0088 +[2025-02-19 17:40:46] (step=0093100) Train Loss: 0.1522, Train Steps/Sec: 11.46, Grad Norm: 0.0088 +[2025-02-19 17:40:55] (step=0093200) Train Loss: 0.1525, Train Steps/Sec: 11.44, Grad Norm: 0.0088 +[2025-02-19 17:41:03] (step=0093300) Train Loss: 0.1542, Train Steps/Sec: 11.42, Grad Norm: 0.0087 +[2025-02-19 17:41:12] (step=0093400) Train Loss: 0.1545, Train Steps/Sec: 11.42, Grad Norm: 0.0088 +[2025-02-19 17:41:21] (step=0093500) Train Loss: 0.1530, Train Steps/Sec: 11.42, Grad Norm: 0.0088 +[2025-02-19 17:41:30] (step=0093600) Train Loss: 0.1528, Train Steps/Sec: 11.42, Grad Norm: 0.0090 +[2025-02-19 17:41:38] (step=0093700) Train Loss: 0.1530, Train Steps/Sec: 11.42, Grad Norm: 0.0086 +[2025-02-19 17:41:47] (step=0093800) Train Loss: 0.1533, Train Steps/Sec: 11.40, Grad Norm: 0.0087 +[2025-02-19 17:41:57] (step=0093900) Train Loss: 0.1523, Train Steps/Sec: 9.92, Grad Norm: 0.0087 +[2025-02-19 17:42:06] (step=0094000) Train Loss: 0.1530, Train Steps/Sec: 11.59, Grad Norm: 0.0087 +[2025-02-19 17:42:14] (step=0094100) Train Loss: 0.1530, Train Steps/Sec: 11.58, Grad Norm: 0.0089 +[2025-02-19 17:42:23] (step=0094200) Train Loss: 0.1523, Train Steps/Sec: 11.58, Grad Norm: 0.0088 +[2025-02-19 17:42:32] (step=0094300) Train Loss: 0.1528, Train Steps/Sec: 11.57, Grad Norm: 0.0091 +[2025-02-19 17:42:40] (step=0094400) Train Loss: 0.1522, Train Steps/Sec: 11.57, Grad Norm: 0.0088 +[2025-02-19 17:42:51] (step=0094500) Train Loss: 0.1520, Train Steps/Sec: 9.72, Grad Norm: 0.0089 +[2025-02-19 17:43:00] (step=0094600) Train Loss: 0.1514, Train Steps/Sec: 10.88, Grad Norm: 0.0092 +[2025-02-19 17:43:08] (step=0094700) Train Loss: 0.1525, Train Steps/Sec: 11.60, Grad Norm: 0.0089 +[2025-02-19 17:43:17] (step=0094800) Train Loss: 0.1541, Train Steps/Sec: 11.60, Grad Norm: 0.0087 +[2025-02-19 17:43:26] (step=0094900) Train Loss: 0.1522, Train Steps/Sec: 11.60, Grad Norm: 0.0086 +[2025-02-19 17:43:34] (step=0095000) Train Loss: 0.1534, Train Steps/Sec: 11.61, Grad Norm: 0.0092 +[2025-02-19 17:43:44] (step=0095100) Train Loss: 0.1528, Train Steps/Sec: 10.03, Grad Norm: 0.0088 +[2025-02-19 17:43:53] (step=0095200) Train Loss: 0.1520, Train Steps/Sec: 11.59, Grad Norm: 0.0088 +[2025-02-19 17:44:02] (step=0095300) Train Loss: 0.1521, Train Steps/Sec: 11.60, Grad Norm: 0.0087 +[2025-02-19 17:44:10] (step=0095400) Train Loss: 0.1524, Train Steps/Sec: 11.60, Grad Norm: 0.0089 +[2025-02-19 17:44:19] (step=0095500) Train Loss: 0.1530, Train Steps/Sec: 11.60, Grad Norm: 0.0094 +[2025-02-19 17:44:27] (step=0095600) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0088 +[2025-02-19 17:44:36] (step=0095700) Train Loss: 0.1529, Train Steps/Sec: 11.61, Grad Norm: 0.0092 +[2025-02-19 17:44:45] (step=0095800) Train Loss: 0.1523, Train Steps/Sec: 11.60, Grad Norm: 0.0090 +[2025-02-19 17:44:53] (step=0095900) Train Loss: 0.1533, Train Steps/Sec: 11.60, Grad Norm: 0.0086 +[2025-02-19 17:45:02] (step=0096000) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0086 +[2025-02-19 17:45:11] (step=0096100) Train Loss: 0.1527, Train Steps/Sec: 11.59, Grad Norm: 0.0089 +[2025-02-19 17:45:19] (step=0096200) Train Loss: 0.1526, Train Steps/Sec: 11.59, Grad Norm: 0.0086 +[2025-02-19 17:45:28] (step=0096300) Train Loss: 0.1523, Train Steps/Sec: 11.56, Grad Norm: 0.0089 +[2025-02-19 17:45:38] (step=0096400) Train Loss: 0.1545, Train Steps/Sec: 10.07, Grad Norm: 0.0089 +[2025-02-19 17:45:46] (step=0096500) Train Loss: 0.1531, Train Steps/Sec: 11.62, Grad Norm: 0.0090 +[2025-02-19 17:45:56] (step=0096600) Train Loss: 0.1531, Train Steps/Sec: 10.92, Grad Norm: 0.0086 +[2025-02-19 17:46:06] (step=0096700) Train Loss: 0.1542, Train Steps/Sec: 9.96, Grad Norm: 0.0090 +[2025-02-19 17:46:14] (step=0096800) Train Loss: 0.1517, Train Steps/Sec: 11.27, Grad Norm: 0.0089 +[2025-02-19 17:46:23] (step=0096900) Train Loss: 0.1538, Train Steps/Sec: 11.62, Grad Norm: 0.0087 +[2025-02-19 17:46:32] (step=0097000) Train Loss: 0.1532, Train Steps/Sec: 11.62, Grad Norm: 0.0086 +[2025-02-19 17:46:40] (step=0097100) Train Loss: 0.1526, Train Steps/Sec: 11.63, Grad Norm: 0.0087 +[2025-02-19 17:46:49] (step=0097200) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0086 +[2025-02-19 17:46:57] (step=0097300) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0087 +[2025-02-19 17:47:06] (step=0097400) Train Loss: 0.1536, Train Steps/Sec: 11.62, Grad Norm: 0.0089 +[2025-02-19 17:47:15] (step=0097500) Train Loss: 0.1528, Train Steps/Sec: 11.63, Grad Norm: 0.0084 +[2025-02-19 17:47:25] (step=0097600) Train Loss: 0.1520, Train Steps/Sec: 10.04, Grad Norm: 0.0088 +[2025-02-19 17:47:33] (step=0097700) Train Loss: 0.1522, Train Steps/Sec: 11.57, Grad Norm: 0.0086 +[2025-02-19 17:47:42] (step=0097800) Train Loss: 0.1534, Train Steps/Sec: 11.57, Grad Norm: 0.0091 +[2025-02-19 17:47:51] (step=0097900) Train Loss: 0.1523, Train Steps/Sec: 11.57, Grad Norm: 0.0085 +[2025-02-19 17:47:59] (step=0098000) Train Loss: 0.1519, Train Steps/Sec: 11.60, Grad Norm: 0.0088 +[2025-02-19 17:48:08] (step=0098100) Train Loss: 0.1524, Train Steps/Sec: 11.62, Grad Norm: 0.0087 +[2025-02-19 17:48:16] (step=0098200) Train Loss: 0.1533, Train Steps/Sec: 11.61, Grad Norm: 0.0091 +[2025-02-19 17:48:25] (step=0098300) Train Loss: 0.1531, Train Steps/Sec: 11.59, Grad Norm: 0.0086 +[2025-02-19 17:48:34] (step=0098400) Train Loss: 0.1527, Train Steps/Sec: 11.59, Grad Norm: 0.0086 +[2025-02-19 17:48:42] (step=0098500) Train Loss: 0.1512, Train Steps/Sec: 11.59, Grad Norm: 0.0087 +[2025-02-19 17:48:51] (step=0098600) Train Loss: 0.1530, Train Steps/Sec: 11.60, Grad Norm: 0.0088 +[2025-02-19 17:49:00] (step=0098700) Train Loss: 0.1541, Train Steps/Sec: 11.60, Grad Norm: 0.0087 +[2025-02-19 17:49:09] (step=0098800) Train Loss: 0.1541, Train Steps/Sec: 10.21, Grad Norm: 0.0086 +[2025-02-19 17:49:20] (step=0098900) Train Loss: 0.1526, Train Steps/Sec: 9.04, Grad Norm: 0.0085 +[2025-02-19 17:49:29] (step=0099000) Train Loss: 0.1515, Train Steps/Sec: 11.60, Grad Norm: 0.0087 +[2025-02-19 17:49:38] (step=0099100) Train Loss: 0.1526, Train Steps/Sec: 11.60, Grad Norm: 0.0086 +[2025-02-19 17:49:46] (step=0099200) Train Loss: 0.1534, Train Steps/Sec: 11.62, Grad Norm: 0.0087 +[2025-02-19 17:49:55] (step=0099300) Train Loss: 0.1525, Train Steps/Sec: 11.62, Grad Norm: 0.0090 +[2025-02-19 17:50:03] (step=0099400) Train Loss: 0.1536, Train Steps/Sec: 11.61, Grad Norm: 0.0089 +[2025-02-19 17:50:12] (step=0099500) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0089 +[2025-02-19 17:50:21] (step=0099600) Train Loss: 0.1527, Train Steps/Sec: 11.60, Grad Norm: 0.0088 +[2025-02-19 17:50:29] (step=0099700) Train Loss: 0.1526, Train Steps/Sec: 11.58, Grad Norm: 0.0088 +[2025-02-19 17:50:38] (step=0099800) Train Loss: 0.1524, Train Steps/Sec: 11.61, Grad Norm: 0.0086 +[2025-02-19 17:50:47] (step=0099900) Train Loss: 0.1535, Train Steps/Sec: 11.61, Grad Norm: 0.0086 +[2025-02-19 17:50:55] (step=0100000) Train Loss: 0.1529, Train Steps/Sec: 11.62, Grad Norm: 0.0088 +[2025-02-19 17:50:56] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0100000.pt +[2025-02-19 17:51:06] (step=0100100) Train Loss: 0.1531, Train Steps/Sec: 9.03, Grad Norm: 0.0088 +[2025-02-19 17:51:15] (step=0100200) Train Loss: 0.1522, Train Steps/Sec: 11.62, Grad Norm: 0.0086 +[2025-02-19 17:51:24] (step=0100300) Train Loss: 0.1534, Train Steps/Sec: 11.61, Grad Norm: 0.0089 +[2025-02-19 17:51:32] (step=0100400) Train Loss: 0.1516, Train Steps/Sec: 11.60, Grad Norm: 0.0085 +[2025-02-19 17:51:41] (step=0100500) Train Loss: 0.1530, Train Steps/Sec: 11.61, Grad Norm: 0.0083 +[2025-02-19 17:51:49] (step=0100600) Train Loss: 0.1528, Train Steps/Sec: 11.60, Grad Norm: 0.0089 +[2025-02-19 17:51:58] (step=0100700) Train Loss: 0.1523, Train Steps/Sec: 11.61, Grad Norm: 0.0086 +[2025-02-19 17:52:07] (step=0100800) Train Loss: 0.1526, Train Steps/Sec: 11.60, Grad Norm: 0.0088 +[2025-02-19 17:52:15] (step=0100900) Train Loss: 0.1529, Train Steps/Sec: 11.61, Grad Norm: 0.0085 +[2025-02-19 17:52:25] (step=0101000) Train Loss: 0.1516, Train Steps/Sec: 10.28, Grad Norm: 0.0085 +[2025-02-19 17:52:34] (step=0101100) Train Loss: 0.1526, Train Steps/Sec: 10.60, Grad Norm: 0.0086 +[2025-02-19 17:52:43] (step=0101200) Train Loss: 0.1519, Train Steps/Sec: 11.62, Grad Norm: 0.0086 +[2025-02-19 17:52:52] (step=0101300) Train Loss: 0.1519, Train Steps/Sec: 11.57, Grad Norm: 0.0084 +[2025-02-19 17:53:02] (step=0101400) Train Loss: 0.1535, Train Steps/Sec: 10.02, Grad Norm: 0.0085 +[2025-02-19 17:53:10] (step=0101500) Train Loss: 0.1523, Train Steps/Sec: 11.60, Grad Norm: 0.0088 +[2025-02-19 17:53:19] (step=0101600) Train Loss: 0.1525, Train Steps/Sec: 11.60, Grad Norm: 0.0089 +[2025-02-19 17:53:28] (step=0101700) Train Loss: 0.1532, Train Steps/Sec: 11.60, Grad Norm: 0.0085 +[2025-02-19 17:53:36] (step=0101800) Train Loss: 0.1529, Train Steps/Sec: 11.61, Grad Norm: 0.0084 +[2025-02-19 17:53:45] (step=0101900) Train Loss: 0.1523, Train Steps/Sec: 11.62, Grad Norm: 0.0089 +[2025-02-19 17:53:53] (step=0102000) Train Loss: 0.1541, Train Steps/Sec: 11.62, Grad Norm: 0.0085 +[2025-02-19 17:54:02] (step=0102100) Train Loss: 0.1525, Train Steps/Sec: 11.61, Grad Norm: 0.0086 +[2025-02-19 17:54:11] (step=0102200) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0083 +[2025-02-19 17:54:19] (step=0102300) Train Loss: 0.1516, Train Steps/Sec: 11.59, Grad Norm: 0.0085 +[2025-02-19 17:54:28] (step=0102400) Train Loss: 0.1519, Train Steps/Sec: 11.60, Grad Norm: 0.0084 +[2025-02-19 17:54:36] (step=0102500) Train Loss: 0.1533, Train Steps/Sec: 11.61, Grad Norm: 0.0083 +[2025-02-19 17:54:46] (step=0102600) Train Loss: 0.1522, Train Steps/Sec: 10.01, Grad Norm: 0.0083 +[2025-02-19 17:54:55] (step=0102700) Train Loss: 0.1522, Train Steps/Sec: 11.61, Grad Norm: 0.0086 +[2025-02-19 17:55:04] (step=0102800) Train Loss: 0.1518, Train Steps/Sec: 11.60, Grad Norm: 0.0086 +[2025-02-19 17:55:12] (step=0102900) Train Loss: 0.1521, Train Steps/Sec: 11.61, Grad Norm: 0.0087 +[2025-02-19 17:55:21] (step=0103000) Train Loss: 0.1518, Train Steps/Sec: 11.61, Grad Norm: 0.0087 +[2025-02-19 17:55:30] (step=0103100) Train Loss: 0.1532, Train Steps/Sec: 11.60, Grad Norm: 0.0087 +[2025-02-19 17:55:40] (step=0103200) Train Loss: 0.1508, Train Steps/Sec: 9.71, Grad Norm: 0.0089 +[2025-02-19 17:55:49] (step=0103300) Train Loss: 0.1525, Train Steps/Sec: 10.89, Grad Norm: 0.0085 +[2025-02-19 17:55:58] (step=0103400) Train Loss: 0.1542, Train Steps/Sec: 11.61, Grad Norm: 0.0086 +[2025-02-19 17:56:06] (step=0103500) Train Loss: 0.1522, Train Steps/Sec: 11.60, Grad Norm: 0.0084 +[2025-02-19 17:56:15] (step=0103600) Train Loss: 0.1547, Train Steps/Sec: 11.61, Grad Norm: 0.0087 +[2025-02-19 17:56:23] (step=0103700) Train Loss: 0.1529, Train Steps/Sec: 11.61, Grad Norm: 0.0087 +[2025-02-19 17:56:32] (step=0103800) Train Loss: 0.1522, Train Steps/Sec: 11.61, Grad Norm: 0.0086 +[2025-02-19 17:56:42] (step=0103900) Train Loss: 0.1529, Train Steps/Sec: 10.01, Grad Norm: 0.0092 +[2025-02-19 17:56:51] (step=0104000) Train Loss: 0.1516, Train Steps/Sec: 11.61, Grad Norm: 0.0086 +[2025-02-19 17:56:59] (step=0104100) Train Loss: 0.1534, Train Steps/Sec: 11.61, Grad Norm: 0.0085 +[2025-02-19 17:57:08] (step=0104200) Train Loss: 0.1541, Train Steps/Sec: 11.62, Grad Norm: 0.0086 +[2025-02-19 17:57:17] (step=0104300) Train Loss: 0.1531, Train Steps/Sec: 11.63, Grad Norm: 0.0086 +[2025-02-19 17:57:25] (step=0104400) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0082 +[2025-02-19 17:57:34] (step=0104500) Train Loss: 0.1522, Train Steps/Sec: 11.62, Grad Norm: 0.0084 +[2025-02-19 17:57:42] (step=0104600) Train Loss: 0.1528, Train Steps/Sec: 11.63, Grad Norm: 0.0089 +[2025-02-19 17:57:51] (step=0104700) Train Loss: 0.1527, Train Steps/Sec: 11.62, Grad Norm: 0.0090 +[2025-02-19 17:58:00] (step=0104800) Train Loss: 0.1513, Train Steps/Sec: 11.63, Grad Norm: 0.0084 +[2025-02-19 17:58:08] (step=0104900) Train Loss: 0.1526, Train Steps/Sec: 11.63, Grad Norm: 0.0084 +[2025-02-19 17:58:17] (step=0105000) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0084 +[2025-02-19 17:58:27] (step=0105100) Train Loss: 0.1518, Train Steps/Sec: 10.02, Grad Norm: 0.0083 +[2025-02-19 17:58:35] (step=0105200) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0084 +[2025-02-19 17:58:45] (step=0105300) Train Loss: 0.1531, Train Steps/Sec: 10.89, Grad Norm: 0.0085 +[2025-02-19 17:58:54] (step=0105400) Train Loss: 0.1529, Train Steps/Sec: 10.27, Grad Norm: 0.0083 +[2025-02-19 17:59:03] (step=0105500) Train Loss: 0.1517, Train Steps/Sec: 11.28, Grad Norm: 0.0084 +[2025-02-19 17:59:12] (step=0105600) Train Loss: 0.1532, Train Steps/Sec: 11.62, Grad Norm: 0.0094 +[2025-02-19 17:59:20] (step=0105700) Train Loss: 0.1528, Train Steps/Sec: 11.62, Grad Norm: 0.0085 +[2025-02-19 17:59:29] (step=0105800) Train Loss: 0.1524, Train Steps/Sec: 11.62, Grad Norm: 0.0083 +[2025-02-19 17:59:38] (step=0105900) Train Loss: 0.1524, Train Steps/Sec: 11.62, Grad Norm: 0.0084 +[2025-02-19 17:59:46] (step=0106000) Train Loss: 0.1524, Train Steps/Sec: 11.61, Grad Norm: 0.0082 +[2025-02-19 17:59:55] (step=0106100) Train Loss: 0.1534, Train Steps/Sec: 11.61, Grad Norm: 0.0085 +[2025-02-19 18:00:03] (step=0106200) Train Loss: 0.1538, Train Steps/Sec: 11.61, Grad Norm: 0.0087 +[2025-02-19 18:00:12] (step=0106300) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0086 +[2025-02-19 18:00:22] (step=0106400) Train Loss: 0.1537, Train Steps/Sec: 9.96, Grad Norm: 0.0085 +[2025-02-19 18:00:31] (step=0106500) Train Loss: 0.1531, Train Steps/Sec: 11.60, Grad Norm: 0.0086 +[2025-02-19 18:00:39] (step=0106600) Train Loss: 0.1521, Train Steps/Sec: 11.60, Grad Norm: 0.0083 +[2025-02-19 18:00:48] (step=0106700) Train Loss: 0.1522, Train Steps/Sec: 11.59, Grad Norm: 0.0085 +[2025-02-19 18:00:57] (step=0106800) Train Loss: 0.1536, Train Steps/Sec: 11.61, Grad Norm: 0.0089 +[2025-02-19 18:01:05] (step=0106900) Train Loss: 0.1518, Train Steps/Sec: 11.61, Grad Norm: 0.0087 +[2025-02-19 18:01:14] (step=0107000) Train Loss: 0.1519, Train Steps/Sec: 11.61, Grad Norm: 0.0085 +[2025-02-19 18:01:22] (step=0107100) Train Loss: 0.1516, Train Steps/Sec: 11.61, Grad Norm: 0.0086 +[2025-02-19 18:01:31] (step=0107200) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0084 +[2025-02-19 18:01:40] (step=0107300) Train Loss: 0.1530, Train Steps/Sec: 11.61, Grad Norm: 0.0084 +[2025-02-19 18:01:48] (step=0107400) Train Loss: 0.1520, Train Steps/Sec: 11.61, Grad Norm: 0.0085 +[2025-02-19 18:01:58] (step=0107500) Train Loss: 0.1524, Train Steps/Sec: 10.24, Grad Norm: 0.0084 +[2025-02-19 18:02:09] (step=0107600) Train Loss: 0.1518, Train Steps/Sec: 9.25, Grad Norm: 0.0086 +[2025-02-19 18:02:17] (step=0107700) Train Loss: 0.1524, Train Steps/Sec: 11.61, Grad Norm: 0.0086 +[2025-02-19 18:02:26] (step=0107800) Train Loss: 0.1527, Train Steps/Sec: 11.62, Grad Norm: 0.0084 +[2025-02-19 18:02:35] (step=0107900) Train Loss: 0.1530, Train Steps/Sec: 11.63, Grad Norm: 0.0083 +[2025-02-19 18:02:43] (step=0108000) Train Loss: 0.1532, Train Steps/Sec: 11.62, Grad Norm: 0.0085 +[2025-02-19 18:02:52] (step=0108100) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0083 +[2025-02-19 18:03:00] (step=0108200) Train Loss: 0.1517, Train Steps/Sec: 11.62, Grad Norm: 0.0085 +[2025-02-19 18:03:09] (step=0108300) Train Loss: 0.1522, Train Steps/Sec: 11.63, Grad Norm: 0.0082 +[2025-02-19 18:03:18] (step=0108400) Train Loss: 0.1532, Train Steps/Sec: 11.62, Grad Norm: 0.0088 +[2025-02-19 18:03:26] (step=0108500) Train Loss: 0.1513, Train Steps/Sec: 11.59, Grad Norm: 0.0084 +[2025-02-19 18:03:35] (step=0108600) Train Loss: 0.1519, Train Steps/Sec: 11.59, Grad Norm: 0.0088 +[2025-02-19 18:03:44] (step=0108700) Train Loss: 0.1528, Train Steps/Sec: 11.59, Grad Norm: 0.0087 +[2025-02-19 18:03:52] (step=0108800) Train Loss: 0.1523, Train Steps/Sec: 11.59, Grad Norm: 0.0083 +[2025-02-19 18:04:02] (step=0108900) Train Loss: 0.1523, Train Steps/Sec: 10.04, Grad Norm: 0.0083 +[2025-02-19 18:04:11] (step=0109000) Train Loss: 0.1534, Train Steps/Sec: 11.61, Grad Norm: 0.0084 +[2025-02-19 18:04:19] (step=0109100) Train Loss: 0.1519, Train Steps/Sec: 11.60, Grad Norm: 0.0085 +[2025-02-19 18:04:28] (step=0109200) Train Loss: 0.1522, Train Steps/Sec: 11.60, Grad Norm: 0.0083 +[2025-02-19 18:04:37] (step=0109300) Train Loss: 0.1517, Train Steps/Sec: 11.60, Grad Norm: 0.0082 +[2025-02-19 18:04:45] (step=0109400) Train Loss: 0.1527, Train Steps/Sec: 11.59, Grad Norm: 0.0084 +[2025-02-19 18:04:54] (step=0109500) Train Loss: 0.1526, Train Steps/Sec: 11.55, Grad Norm: 0.0086 +[2025-02-19 18:05:03] (step=0109600) Train Loss: 0.1523, Train Steps/Sec: 11.54, Grad Norm: 0.0085 +[2025-02-19 18:05:12] (step=0109700) Train Loss: 0.1521, Train Steps/Sec: 10.20, Grad Norm: 0.0083 +[2025-02-19 18:05:22] (step=0109800) Train Loss: 0.1540, Train Steps/Sec: 10.58, Grad Norm: 0.0086 +[2025-02-19 18:05:30] (step=0109900) Train Loss: 0.1520, Train Steps/Sec: 11.60, Grad Norm: 0.0083 +[2025-02-19 18:05:39] (step=0110000) Train Loss: 0.1514, Train Steps/Sec: 11.60, Grad Norm: 0.0083 +[2025-02-19 18:05:49] (step=0110100) Train Loss: 0.1518, Train Steps/Sec: 10.04, Grad Norm: 0.0087 +[2025-02-19 18:05:58] (step=0110200) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0083 +[2025-02-19 18:06:06] (step=0110300) Train Loss: 0.1535, Train Steps/Sec: 11.62, Grad Norm: 0.0086 +[2025-02-19 18:06:15] (step=0110400) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0082 +[2025-02-19 18:06:23] (step=0110500) Train Loss: 0.1525, Train Steps/Sec: 11.63, Grad Norm: 0.0083 +[2025-02-19 18:06:32] (step=0110600) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0081 +[2025-02-19 18:06:41] (step=0110700) Train Loss: 0.1532, Train Steps/Sec: 11.61, Grad Norm: 0.0082 +[2025-02-19 18:06:49] (step=0110800) Train Loss: 0.1524, Train Steps/Sec: 11.64, Grad Norm: 0.0084 +[2025-02-19 18:06:58] (step=0110900) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0082 +[2025-02-19 18:07:06] (step=0111000) Train Loss: 0.1534, Train Steps/Sec: 11.64, Grad Norm: 0.0084 +[2025-02-19 18:07:15] (step=0111100) Train Loss: 0.1520, Train Steps/Sec: 11.64, Grad Norm: 0.0085 +[2025-02-19 18:07:24] (step=0111200) Train Loss: 0.1536, Train Steps/Sec: 11.64, Grad Norm: 0.0083 +[2025-02-19 18:07:32] (step=0111300) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0081 +[2025-02-19 18:07:42] (step=0111400) Train Loss: 0.1518, Train Steps/Sec: 10.03, Grad Norm: 0.0086 +[2025-02-19 18:07:51] (step=0111500) Train Loss: 0.1535, Train Steps/Sec: 11.62, Grad Norm: 0.0085 +[2025-02-19 18:07:59] (step=0111600) Train Loss: 0.1519, Train Steps/Sec: 11.62, Grad Norm: 0.0084 +[2025-02-19 18:08:08] (step=0111700) Train Loss: 0.1524, Train Steps/Sec: 11.63, Grad Norm: 0.0086 +[2025-02-19 18:08:17] (step=0111800) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0080 +[2025-02-19 18:08:27] (step=0111900) Train Loss: 0.1527, Train Steps/Sec: 10.01, Grad Norm: 0.0085 +[2025-02-19 18:08:36] (step=0112000) Train Loss: 0.1514, Train Steps/Sec: 10.93, Grad Norm: 0.0088 +[2025-02-19 18:08:44] (step=0112100) Train Loss: 0.1523, Train Steps/Sec: 11.61, Grad Norm: 0.0083 +[2025-02-19 18:08:53] (step=0112200) Train Loss: 0.1516, Train Steps/Sec: 11.61, Grad Norm: 0.0084 +[2025-02-19 18:09:02] (step=0112300) Train Loss: 0.1528, Train Steps/Sec: 11.61, Grad Norm: 0.0087 +[2025-02-19 18:09:10] (step=0112400) Train Loss: 0.1532, Train Steps/Sec: 11.62, Grad Norm: 0.0085 +[2025-02-19 18:09:19] (step=0112500) Train Loss: 0.1526, Train Steps/Sec: 11.62, Grad Norm: 0.0083 +[2025-02-19 18:09:29] (step=0112600) Train Loss: 0.1518, Train Steps/Sec: 10.00, Grad Norm: 0.0085 +[2025-02-19 18:09:37] (step=0112700) Train Loss: 0.1526, Train Steps/Sec: 11.63, Grad Norm: 0.0084 +[2025-02-19 18:09:46] (step=0112800) Train Loss: 0.1525, Train Steps/Sec: 11.63, Grad Norm: 0.0084 +[2025-02-19 18:09:55] (step=0112900) Train Loss: 0.1525, Train Steps/Sec: 11.64, Grad Norm: 0.0085 +[2025-02-19 18:10:03] (step=0113000) Train Loss: 0.1516, Train Steps/Sec: 11.63, Grad Norm: 0.0081 +[2025-02-19 18:10:12] (step=0113100) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0084 +[2025-02-19 18:10:20] (step=0113200) Train Loss: 0.1528, Train Steps/Sec: 11.63, Grad Norm: 0.0085 +[2025-02-19 18:10:29] (step=0113300) Train Loss: 0.1523, Train Steps/Sec: 11.65, Grad Norm: 0.0082 +[2025-02-19 18:10:38] (step=0113400) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0083 +[2025-02-19 18:10:46] (step=0113500) Train Loss: 0.1530, Train Steps/Sec: 11.63, Grad Norm: 0.0083 +[2025-02-19 18:10:55] (step=0113600) Train Loss: 0.1534, Train Steps/Sec: 11.64, Grad Norm: 0.0083 +[2025-02-19 18:11:03] (step=0113700) Train Loss: 0.1524, Train Steps/Sec: 11.64, Grad Norm: 0.0084 +[2025-02-19 18:11:12] (step=0113800) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0086 +[2025-02-19 18:11:22] (step=0113900) Train Loss: 0.1532, Train Steps/Sec: 10.03, Grad Norm: 0.0085 +[2025-02-19 18:11:31] (step=0114000) Train Loss: 0.1529, Train Steps/Sec: 10.56, Grad Norm: 0.0083 +[2025-02-19 18:11:41] (step=0114100) Train Loss: 0.1526, Train Steps/Sec: 10.29, Grad Norm: 0.0086 +[2025-02-19 18:11:50] (step=0114200) Train Loss: 0.1526, Train Steps/Sec: 11.28, Grad Norm: 0.0081 +[2025-02-19 18:11:59] (step=0114300) Train Loss: 0.1524, Train Steps/Sec: 11.63, Grad Norm: 0.0085 +[2025-02-19 18:12:07] (step=0114400) Train Loss: 0.1517, Train Steps/Sec: 11.62, Grad Norm: 0.0084 +[2025-02-19 18:12:16] (step=0114500) Train Loss: 0.1525, Train Steps/Sec: 11.61, Grad Norm: 0.0082 +[2025-02-19 18:12:24] (step=0114600) Train Loss: 0.1522, Train Steps/Sec: 11.61, Grad Norm: 0.0085 +[2025-02-19 18:12:33] (step=0114700) Train Loss: 0.1519, Train Steps/Sec: 11.62, Grad Norm: 0.0087 +[2025-02-19 18:12:42] (step=0114800) Train Loss: 0.1532, Train Steps/Sec: 11.61, Grad Norm: 0.0082 +[2025-02-19 18:12:50] (step=0114900) Train Loss: 0.1531, Train Steps/Sec: 11.61, Grad Norm: 0.0081 +[2025-02-19 18:12:59] (step=0115000) Train Loss: 0.1524, Train Steps/Sec: 11.61, Grad Norm: 0.0081 +[2025-02-19 18:13:09] (step=0115100) Train Loss: 0.1525, Train Steps/Sec: 10.00, Grad Norm: 0.0084 +[2025-02-19 18:13:18] (step=0115200) Train Loss: 0.1524, Train Steps/Sec: 11.58, Grad Norm: 0.0081 +[2025-02-19 18:13:26] (step=0115300) Train Loss: 0.1526, Train Steps/Sec: 11.55, Grad Norm: 0.0082 +[2025-02-19 18:13:35] (step=0115400) Train Loss: 0.1522, Train Steps/Sec: 11.56, Grad Norm: 0.0081 +[2025-02-19 18:13:44] (step=0115500) Train Loss: 0.1541, Train Steps/Sec: 11.57, Grad Norm: 0.0082 +[2025-02-19 18:13:52] (step=0115600) Train Loss: 0.1526, Train Steps/Sec: 11.56, Grad Norm: 0.0083 +[2025-02-19 18:14:01] (step=0115700) Train Loss: 0.1523, Train Steps/Sec: 11.58, Grad Norm: 0.0083 +[2025-02-19 18:14:09] (step=0115800) Train Loss: 0.1528, Train Steps/Sec: 11.59, Grad Norm: 0.0086 +[2025-02-19 18:14:18] (step=0115900) Train Loss: 0.1522, Train Steps/Sec: 11.59, Grad Norm: 0.0084 +[2025-02-19 18:14:27] (step=0116000) Train Loss: 0.1527, Train Steps/Sec: 11.60, Grad Norm: 0.0082 +[2025-02-19 18:14:35] (step=0116100) Train Loss: 0.1522, Train Steps/Sec: 11.61, Grad Norm: 0.0083 +[2025-02-19 18:14:45] (step=0116200) Train Loss: 0.1536, Train Steps/Sec: 9.96, Grad Norm: 0.0081 +[2025-02-19 18:14:55] (step=0116300) Train Loss: 0.1510, Train Steps/Sec: 10.88, Grad Norm: 0.0083 +[2025-02-19 18:15:05] (step=0116400) Train Loss: 0.1535, Train Steps/Sec: 9.75, Grad Norm: 0.0084 +[2025-02-19 18:15:13] (step=0116500) Train Loss: 0.1524, Train Steps/Sec: 11.61, Grad Norm: 0.0084 +[2025-02-19 18:15:22] (step=0116600) Train Loss: 0.1519, Train Steps/Sec: 11.59, Grad Norm: 0.0081 +[2025-02-19 18:15:31] (step=0116700) Train Loss: 0.1519, Train Steps/Sec: 11.60, Grad Norm: 0.0082 +[2025-02-19 18:15:39] (step=0116800) Train Loss: 0.1527, Train Steps/Sec: 11.60, Grad Norm: 0.0088 +[2025-02-19 18:15:48] (step=0116900) Train Loss: 0.1515, Train Steps/Sec: 11.60, Grad Norm: 0.0080 +[2025-02-19 18:15:57] (step=0117000) Train Loss: 0.1517, Train Steps/Sec: 11.61, Grad Norm: 0.0084 +[2025-02-19 18:16:05] (step=0117100) Train Loss: 0.1522, Train Steps/Sec: 11.60, Grad Norm: 0.0083 +[2025-02-19 18:16:14] (step=0117200) Train Loss: 0.1510, Train Steps/Sec: 11.59, Grad Norm: 0.0081 +[2025-02-19 18:16:22] (step=0117300) Train Loss: 0.1535, Train Steps/Sec: 11.60, Grad Norm: 0.0085 +[2025-02-19 18:16:31] (step=0117400) Train Loss: 0.1522, Train Steps/Sec: 11.61, Grad Norm: 0.0084 +[2025-02-19 18:16:40] (step=0117500) Train Loss: 0.1524, Train Steps/Sec: 11.62, Grad Norm: 0.0080 +[2025-02-19 18:16:50] (step=0117600) Train Loss: 0.1517, Train Steps/Sec: 10.04, Grad Norm: 0.0085 +[2025-02-19 18:16:58] (step=0117700) Train Loss: 0.1517, Train Steps/Sec: 11.62, Grad Norm: 0.0081 +[2025-02-19 18:17:07] (step=0117800) Train Loss: 0.1525, Train Steps/Sec: 11.62, Grad Norm: 0.0083 +[2025-02-19 18:17:15] (step=0117900) Train Loss: 0.1527, Train Steps/Sec: 11.62, Grad Norm: 0.0084 +[2025-02-19 18:17:24] (step=0118000) Train Loss: 0.1539, Train Steps/Sec: 11.62, Grad Norm: 0.0083 +[2025-02-19 18:17:33] (step=0118100) Train Loss: 0.1523, Train Steps/Sec: 11.63, Grad Norm: 0.0081 +[2025-02-19 18:17:41] (step=0118200) Train Loss: 0.1523, Train Steps/Sec: 11.64, Grad Norm: 0.0084 +[2025-02-19 18:17:50] (step=0118300) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0082 +[2025-02-19 18:18:00] (step=0118400) Train Loss: 0.1508, Train Steps/Sec: 9.75, Grad Norm: 0.0082 +[2025-02-19 18:18:09] (step=0118500) Train Loss: 0.1527, Train Steps/Sec: 10.93, Grad Norm: 0.0082 +[2025-02-19 18:18:18] (step=0118600) Train Loss: 0.1516, Train Steps/Sec: 11.61, Grad Norm: 0.0084 +[2025-02-19 18:18:26] (step=0118700) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0081 +[2025-02-19 18:18:35] (step=0118800) Train Loss: 0.1525, Train Steps/Sec: 11.62, Grad Norm: 0.0083 +[2025-02-19 18:18:45] (step=0118900) Train Loss: 0.1531, Train Steps/Sec: 10.07, Grad Norm: 0.0082 +[2025-02-19 18:18:54] (step=0119000) Train Loss: 0.1522, Train Steps/Sec: 11.61, Grad Norm: 0.0082 +[2025-02-19 18:19:02] (step=0119100) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0083 +[2025-02-19 18:19:11] (step=0119200) Train Loss: 0.1528, Train Steps/Sec: 11.62, Grad Norm: 0.0081 +[2025-02-19 18:19:19] (step=0119300) Train Loss: 0.1530, Train Steps/Sec: 11.61, Grad Norm: 0.0081 +[2025-02-19 18:19:28] (step=0119400) Train Loss: 0.1528, Train Steps/Sec: 11.62, Grad Norm: 0.0082 +[2025-02-19 18:19:37] (step=0119500) Train Loss: 0.1519, Train Steps/Sec: 11.62, Grad Norm: 0.0083 +[2025-02-19 18:19:45] (step=0119600) Train Loss: 0.1524, Train Steps/Sec: 11.62, Grad Norm: 0.0083 +[2025-02-19 18:19:54] (step=0119700) Train Loss: 0.1522, Train Steps/Sec: 11.63, Grad Norm: 0.0085 +[2025-02-19 18:20:02] (step=0119800) Train Loss: 0.1529, Train Steps/Sec: 11.62, Grad Norm: 0.0082 +[2025-02-19 18:20:11] (step=0119900) Train Loss: 0.1521, Train Steps/Sec: 11.62, Grad Norm: 0.0082 +[2025-02-19 18:20:20] (step=0120000) Train Loss: 0.1528, Train Steps/Sec: 11.61, Grad Norm: 0.0081 +[2025-02-19 18:20:21] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0120000.pt +[2025-02-19 18:20:31] (step=0120100) Train Loss: 0.1514, Train Steps/Sec: 9.25, Grad Norm: 0.0082 +[2025-02-19 18:20:39] (step=0120200) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0084 +[2025-02-19 18:20:48] (step=0120300) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0083 +[2025-02-19 18:20:56] (step=0120400) Train Loss: 0.1530, Train Steps/Sec: 11.63, Grad Norm: 0.0082 +[2025-02-19 18:21:05] (step=0120500) Train Loss: 0.1518, Train Steps/Sec: 11.64, Grad Norm: 0.0084 +[2025-02-19 18:21:15] (step=0120600) Train Loss: 0.1527, Train Steps/Sec: 9.47, Grad Norm: 0.0083 +[2025-02-19 18:21:24] (step=0120700) Train Loss: 0.1523, Train Steps/Sec: 11.30, Grad Norm: 0.0082 +[2025-02-19 18:21:33] (step=0120800) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0083 +[2025-02-19 18:21:41] (step=0120900) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0081 +[2025-02-19 18:21:50] (step=0121000) Train Loss: 0.1517, Train Steps/Sec: 11.65, Grad Norm: 0.0082 +[2025-02-19 18:21:59] (step=0121100) Train Loss: 0.1531, Train Steps/Sec: 11.65, Grad Norm: 0.0082 +[2025-02-19 18:22:07] (step=0121200) Train Loss: 0.1520, Train Steps/Sec: 11.64, Grad Norm: 0.0078 +[2025-02-19 18:22:16] (step=0121300) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0082 +[2025-02-19 18:22:26] (step=0121400) Train Loss: 0.1528, Train Steps/Sec: 10.08, Grad Norm: 0.0083 +[2025-02-19 18:22:34] (step=0121500) Train Loss: 0.1528, Train Steps/Sec: 11.65, Grad Norm: 0.0082 +[2025-02-19 18:22:43] (step=0121600) Train Loss: 0.1519, Train Steps/Sec: 11.65, Grad Norm: 0.0084 +[2025-02-19 18:22:51] (step=0121700) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0081 +[2025-02-19 18:23:00] (step=0121800) Train Loss: 0.1522, Train Steps/Sec: 11.64, Grad Norm: 0.0080 +[2025-02-19 18:23:09] (step=0121900) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0079 +[2025-02-19 18:23:17] (step=0122000) Train Loss: 0.1533, Train Steps/Sec: 11.64, Grad Norm: 0.0081 +[2025-02-19 18:23:26] (step=0122100) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0080 +[2025-02-19 18:23:34] (step=0122200) Train Loss: 0.1514, Train Steps/Sec: 11.65, Grad Norm: 0.0082 +[2025-02-19 18:23:43] (step=0122300) Train Loss: 0.1527, Train Steps/Sec: 11.65, Grad Norm: 0.0080 +[2025-02-19 18:23:52] (step=0122400) Train Loss: 0.1524, Train Steps/Sec: 11.65, Grad Norm: 0.0081 +[2025-02-19 18:24:00] (step=0122500) Train Loss: 0.1519, Train Steps/Sec: 11.65, Grad Norm: 0.0083 +[2025-02-19 18:24:10] (step=0122600) Train Loss: 0.1524, Train Steps/Sec: 10.08, Grad Norm: 0.0083 +[2025-02-19 18:24:20] (step=0122700) Train Loss: 0.1524, Train Steps/Sec: 10.58, Grad Norm: 0.0082 +[2025-02-19 18:24:29] (step=0122800) Train Loss: 0.1536, Train Steps/Sec: 10.29, Grad Norm: 0.0082 +[2025-02-19 18:24:38] (step=0122900) Train Loss: 0.1518, Train Steps/Sec: 11.29, Grad Norm: 0.0081 +[2025-02-19 18:24:47] (step=0123000) Train Loss: 0.1519, Train Steps/Sec: 11.64, Grad Norm: 0.0082 +[2025-02-19 18:24:55] (step=0123100) Train Loss: 0.1531, Train Steps/Sec: 11.63, Grad Norm: 0.0083 +[2025-02-19 18:25:04] (step=0123200) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0079 +[2025-02-19 18:25:13] (step=0123300) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0082 +[2025-02-19 18:25:21] (step=0123400) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0080 +[2025-02-19 18:25:30] (step=0123500) Train Loss: 0.1521, Train Steps/Sec: 11.63, Grad Norm: 0.0081 +[2025-02-19 18:25:38] (step=0123600) Train Loss: 0.1519, Train Steps/Sec: 11.64, Grad Norm: 0.0082 +[2025-02-19 18:25:47] (step=0123700) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0081 +[2025-02-19 18:25:56] (step=0123800) Train Loss: 0.1528, Train Steps/Sec: 11.63, Grad Norm: 0.0082 +[2025-02-19 18:26:06] (step=0123900) Train Loss: 0.1506, Train Steps/Sec: 9.98, Grad Norm: 0.0083 +[2025-02-19 18:26:14] (step=0124000) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0084 +[2025-02-19 18:26:23] (step=0124100) Train Loss: 0.1534, Train Steps/Sec: 11.61, Grad Norm: 0.0083 +[2025-02-19 18:26:31] (step=0124200) Train Loss: 0.1515, Train Steps/Sec: 11.60, Grad Norm: 0.0085 +[2025-02-19 18:26:40] (step=0124300) Train Loss: 0.1523, Train Steps/Sec: 11.61, Grad Norm: 0.0082 +[2025-02-19 18:26:49] (step=0124400) Train Loss: 0.1523, Train Steps/Sec: 11.62, Grad Norm: 0.0080 +[2025-02-19 18:26:57] (step=0124500) Train Loss: 0.1525, Train Steps/Sec: 11.62, Grad Norm: 0.0080 +[2025-02-19 18:27:06] (step=0124600) Train Loss: 0.1536, Train Steps/Sec: 11.62, Grad Norm: 0.0082 +[2025-02-19 18:27:14] (step=0124700) Train Loss: 0.1519, Train Steps/Sec: 11.64, Grad Norm: 0.0081 +[2025-02-19 18:27:23] (step=0124800) Train Loss: 0.1516, Train Steps/Sec: 11.63, Grad Norm: 0.0079 +[2025-02-19 18:27:33] (step=0124900) Train Loss: 0.1523, Train Steps/Sec: 9.97, Grad Norm: 0.0079 +[2025-02-19 18:27:42] (step=0125000) Train Loss: 0.1512, Train Steps/Sec: 10.93, Grad Norm: 0.0083 +[2025-02-19 18:27:52] (step=0125100) Train Loss: 0.1517, Train Steps/Sec: 10.50, Grad Norm: 0.0083 +[2025-02-19 18:28:01] (step=0125200) Train Loss: 0.1533, Train Steps/Sec: 10.65, Grad Norm: 0.0084 +[2025-02-19 18:28:10] (step=0125300) Train Loss: 0.1515, Train Steps/Sec: 11.60, Grad Norm: 0.0080 +[2025-02-19 18:28:18] (step=0125400) Train Loss: 0.1520, Train Steps/Sec: 11.60, Grad Norm: 0.0082 +[2025-02-19 18:28:27] (step=0125500) Train Loss: 0.1526, Train Steps/Sec: 11.60, Grad Norm: 0.0080 +[2025-02-19 18:28:36] (step=0125600) Train Loss: 0.1516, Train Steps/Sec: 11.59, Grad Norm: 0.0080 +[2025-02-19 18:28:44] (step=0125700) Train Loss: 0.1525, Train Steps/Sec: 11.61, Grad Norm: 0.0081 +[2025-02-19 18:28:53] (step=0125800) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0079 +[2025-02-19 18:29:01] (step=0125900) Train Loss: 0.1531, Train Steps/Sec: 11.61, Grad Norm: 0.0080 +[2025-02-19 18:29:10] (step=0126000) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0084 +[2025-02-19 18:29:19] (step=0126100) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0080 +[2025-02-19 18:29:27] (step=0126200) Train Loss: 0.1525, Train Steps/Sec: 11.63, Grad Norm: 0.0083 +[2025-02-19 18:29:36] (step=0126300) Train Loss: 0.1532, Train Steps/Sec: 11.63, Grad Norm: 0.0082 +[2025-02-19 18:29:46] (step=0126400) Train Loss: 0.1527, Train Steps/Sec: 10.04, Grad Norm: 0.0081 +[2025-02-19 18:29:54] (step=0126500) Train Loss: 0.1524, Train Steps/Sec: 11.63, Grad Norm: 0.0081 +[2025-02-19 18:30:03] (step=0126600) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0082 +[2025-02-19 18:30:12] (step=0126700) Train Loss: 0.1520, Train Steps/Sec: 11.64, Grad Norm: 0.0081 +[2025-02-19 18:30:20] (step=0126800) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0082 +[2025-02-19 18:30:29] (step=0126900) Train Loss: 0.1532, Train Steps/Sec: 11.62, Grad Norm: 0.0080 +[2025-02-19 18:30:37] (step=0127000) Train Loss: 0.1520, Train Steps/Sec: 11.62, Grad Norm: 0.0079 +[2025-02-19 18:30:48] (step=0127100) Train Loss: 0.1524, Train Steps/Sec: 9.72, Grad Norm: 0.0082 +[2025-02-19 18:30:57] (step=0127200) Train Loss: 0.1533, Train Steps/Sec: 10.91, Grad Norm: 0.0081 +[2025-02-19 18:31:06] (step=0127300) Train Loss: 0.1524, Train Steps/Sec: 11.62, Grad Norm: 0.0082 +[2025-02-19 18:31:14] (step=0127400) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0079 +[2025-02-19 18:31:23] (step=0127500) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0082 +[2025-02-19 18:31:31] (step=0127600) Train Loss: 0.1522, Train Steps/Sec: 11.59, Grad Norm: 0.0083 +[2025-02-19 18:31:41] (step=0127700) Train Loss: 0.1526, Train Steps/Sec: 10.07, Grad Norm: 0.0081 +[2025-02-19 18:31:50] (step=0127800) Train Loss: 0.1531, Train Steps/Sec: 11.61, Grad Norm: 0.0079 +[2025-02-19 18:31:59] (step=0127900) Train Loss: 0.1522, Train Steps/Sec: 11.62, Grad Norm: 0.0082 +[2025-02-19 18:32:07] (step=0128000) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0081 +[2025-02-19 18:32:16] (step=0128100) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0079 +[2025-02-19 18:32:24] (step=0128200) Train Loss: 0.1538, Train Steps/Sec: 11.63, Grad Norm: 0.0082 +[2025-02-19 18:32:33] (step=0128300) Train Loss: 0.1523, Train Steps/Sec: 11.63, Grad Norm: 0.0083 +[2025-02-19 18:32:42] (step=0128400) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0081 +[2025-02-19 18:32:50] (step=0128500) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0082 +[2025-02-19 18:32:59] (step=0128600) Train Loss: 0.1535, Train Steps/Sec: 11.64, Grad Norm: 0.0084 +[2025-02-19 18:33:07] (step=0128700) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0080 +[2025-02-19 18:33:16] (step=0128800) Train Loss: 0.1527, Train Steps/Sec: 11.64, Grad Norm: 0.0082 +[2025-02-19 18:33:26] (step=0128900) Train Loss: 0.1539, Train Steps/Sec: 10.05, Grad Norm: 0.0083 +[2025-02-19 18:33:34] (step=0129000) Train Loss: 0.1524, Train Steps/Sec: 11.61, Grad Norm: 0.0080 +[2025-02-19 18:33:43] (step=0129100) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0080 +[2025-02-19 18:33:52] (step=0129200) Train Loss: 0.1513, Train Steps/Sec: 11.23, Grad Norm: 0.0083 +[2025-02-19 18:34:02] (step=0129300) Train Loss: 0.1527, Train Steps/Sec: 9.73, Grad Norm: 0.0082 +[2025-02-19 18:34:11] (step=0129400) Train Loss: 0.1521, Train Steps/Sec: 11.29, Grad Norm: 0.0079 +[2025-02-19 18:34:20] (step=0129500) Train Loss: 0.1523, Train Steps/Sec: 11.62, Grad Norm: 0.0083 +[2025-02-19 18:34:28] (step=0129600) Train Loss: 0.1533, Train Steps/Sec: 11.62, Grad Norm: 0.0082 +[2025-02-19 18:34:37] (step=0129700) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0080 +[2025-02-19 18:34:46] (step=0129800) Train Loss: 0.1516, Train Steps/Sec: 11.61, Grad Norm: 0.0078 +[2025-02-19 18:34:54] (step=0129900) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0078 +[2025-02-19 18:35:03] (step=0130000) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0078 +[2025-02-19 18:35:11] (step=0130100) Train Loss: 0.1526, Train Steps/Sec: 11.58, Grad Norm: 0.0082 +[2025-02-19 18:35:21] (step=0130200) Train Loss: 0.1519, Train Steps/Sec: 10.05, Grad Norm: 0.0080 +[2025-02-19 18:35:30] (step=0130300) Train Loss: 0.1515, Train Steps/Sec: 11.61, Grad Norm: 0.0080 +[2025-02-19 18:35:39] (step=0130400) Train Loss: 0.1527, Train Steps/Sec: 11.63, Grad Norm: 0.0077 +[2025-02-19 18:35:47] (step=0130500) Train Loss: 0.1521, Train Steps/Sec: 11.63, Grad Norm: 0.0080 +[2025-02-19 18:35:56] (step=0130600) Train Loss: 0.1520, Train Steps/Sec: 11.64, Grad Norm: 0.0079 +[2025-02-19 18:36:04] (step=0130700) Train Loss: 0.1520, Train Steps/Sec: 11.62, Grad Norm: 0.0082 +[2025-02-19 18:36:13] (step=0130800) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0080 +[2025-02-19 18:36:22] (step=0130900) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0081 +[2025-02-19 18:36:30] (step=0131000) Train Loss: 0.1531, Train Steps/Sec: 11.64, Grad Norm: 0.0080 +[2025-02-19 18:36:39] (step=0131100) Train Loss: 0.1522, Train Steps/Sec: 11.64, Grad Norm: 0.0078 +[2025-02-19 18:36:47] (step=0131200) Train Loss: 0.1535, Train Steps/Sec: 11.64, Grad Norm: 0.0080 +[2025-02-19 18:36:56] (step=0131300) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0081 +[2025-02-19 18:37:07] (step=0131400) Train Loss: 0.1533, Train Steps/Sec: 9.00, Grad Norm: 0.0078 +[2025-02-19 18:37:17] (step=0131500) Train Loss: 0.1522, Train Steps/Sec: 10.60, Grad Norm: 0.0080 +[2025-02-19 18:37:25] (step=0131600) Train Loss: 0.1511, Train Steps/Sec: 11.27, Grad Norm: 0.0079 +[2025-02-19 18:37:34] (step=0131700) Train Loss: 0.1530, Train Steps/Sec: 11.60, Grad Norm: 0.0078 +[2025-02-19 18:37:43] (step=0131800) Train Loss: 0.1520, Train Steps/Sec: 11.62, Grad Norm: 0.0081 +[2025-02-19 18:37:51] (step=0131900) Train Loss: 0.1526, Train Steps/Sec: 11.61, Grad Norm: 0.0079 +[2025-02-19 18:38:00] (step=0132000) Train Loss: 0.1519, Train Steps/Sec: 11.62, Grad Norm: 0.0080 +[2025-02-19 18:38:09] (step=0132100) Train Loss: 0.1521, Train Steps/Sec: 11.59, Grad Norm: 0.0083 +[2025-02-19 18:38:17] (step=0132200) Train Loss: 0.1517, Train Steps/Sec: 11.61, Grad Norm: 0.0080 +[2025-02-19 18:38:26] (step=0132300) Train Loss: 0.1515, Train Steps/Sec: 11.61, Grad Norm: 0.0078 +[2025-02-19 18:38:34] (step=0132400) Train Loss: 0.1528, Train Steps/Sec: 11.61, Grad Norm: 0.0080 +[2025-02-19 18:38:43] (step=0132500) Train Loss: 0.1524, Train Steps/Sec: 11.61, Grad Norm: 0.0083 +[2025-02-19 18:38:52] (step=0132600) Train Loss: 0.1542, Train Steps/Sec: 11.57, Grad Norm: 0.0082 +[2025-02-19 18:39:02] (step=0132700) Train Loss: 0.1516, Train Steps/Sec: 9.88, Grad Norm: 0.0078 +[2025-02-19 18:39:10] (step=0132800) Train Loss: 0.1527, Train Steps/Sec: 11.63, Grad Norm: 0.0082 +[2025-02-19 18:39:19] (step=0132900) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0078 +[2025-02-19 18:39:28] (step=0133000) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0084 +[2025-02-19 18:39:36] (step=0133100) Train Loss: 0.1526, Train Steps/Sec: 11.62, Grad Norm: 0.0080 +[2025-02-19 18:39:45] (step=0133200) Train Loss: 0.1518, Train Steps/Sec: 11.59, Grad Norm: 0.0080 +[2025-02-19 18:39:53] (step=0133300) Train Loss: 0.1534, Train Steps/Sec: 11.59, Grad Norm: 0.0080 +[2025-02-19 18:40:02] (step=0133400) Train Loss: 0.1520, Train Steps/Sec: 11.58, Grad Norm: 0.0079 +[2025-02-19 18:40:11] (step=0133500) Train Loss: 0.1539, Train Steps/Sec: 11.63, Grad Norm: 0.0083 +[2025-02-19 18:40:20] (step=0133600) Train Loss: 0.1529, Train Steps/Sec: 10.26, Grad Norm: 0.0079 +[2025-02-19 18:40:30] (step=0133700) Train Loss: 0.1513, Train Steps/Sec: 10.63, Grad Norm: 0.0079 +[2025-02-19 18:40:39] (step=0133800) Train Loss: 0.1521, Train Steps/Sec: 11.30, Grad Norm: 0.0081 +[2025-02-19 18:40:49] (step=0133900) Train Loss: 0.1524, Train Steps/Sec: 10.01, Grad Norm: 0.0078 +[2025-02-19 18:40:57] (step=0134000) Train Loss: 0.1517, Train Steps/Sec: 11.62, Grad Norm: 0.0081 +[2025-02-19 18:41:06] (step=0134100) Train Loss: 0.1528, Train Steps/Sec: 11.61, Grad Norm: 0.0079 +[2025-02-19 18:41:14] (step=0134200) Train Loss: 0.1523, Train Steps/Sec: 11.63, Grad Norm: 0.0080 +[2025-02-19 18:41:23] (step=0134300) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0078 +[2025-02-19 18:41:32] (step=0134400) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0080 +[2025-02-19 18:41:40] (step=0134500) Train Loss: 0.1514, Train Steps/Sec: 11.62, Grad Norm: 0.0082 +[2025-02-19 18:41:49] (step=0134600) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0081 +[2025-02-19 18:41:58] (step=0134700) Train Loss: 0.1516, Train Steps/Sec: 11.62, Grad Norm: 0.0079 +[2025-02-19 18:42:06] (step=0134800) Train Loss: 0.1527, Train Steps/Sec: 11.62, Grad Norm: 0.0084 +[2025-02-19 18:42:15] (step=0134900) Train Loss: 0.1537, Train Steps/Sec: 11.62, Grad Norm: 0.0080 +[2025-02-19 18:42:23] (step=0135000) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0080 +[2025-02-19 18:42:32] (step=0135100) Train Loss: 0.1525, Train Steps/Sec: 11.58, Grad Norm: 0.0078 +[2025-02-19 18:42:42] (step=0135200) Train Loss: 0.1503, Train Steps/Sec: 10.03, Grad Norm: 0.0082 +[2025-02-19 18:42:51] (step=0135300) Train Loss: 0.1529, Train Steps/Sec: 11.63, Grad Norm: 0.0078 +[2025-02-19 18:42:59] (step=0135400) Train Loss: 0.1522, Train Steps/Sec: 11.63, Grad Norm: 0.0078 +[2025-02-19 18:43:08] (step=0135500) Train Loss: 0.1517, Train Steps/Sec: 11.62, Grad Norm: 0.0078 +[2025-02-19 18:43:16] (step=0135600) Train Loss: 0.1527, Train Steps/Sec: 11.62, Grad Norm: 0.0084 +[2025-02-19 18:43:25] (step=0135700) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0079 +[2025-02-19 18:43:35] (step=0135800) Train Loss: 0.1532, Train Steps/Sec: 9.70, Grad Norm: 0.0083 +[2025-02-19 18:43:44] (step=0135900) Train Loss: 0.1521, Train Steps/Sec: 10.93, Grad Norm: 0.0079 +[2025-02-19 18:43:53] (step=0136000) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0077 +[2025-02-19 18:44:02] (step=0136100) Train Loss: 0.1539, Train Steps/Sec: 11.63, Grad Norm: 0.0079 +[2025-02-19 18:44:10] (step=0136200) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0079 +[2025-02-19 18:44:19] (step=0136300) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0077 +[2025-02-19 18:44:29] (step=0136400) Train Loss: 0.1530, Train Steps/Sec: 10.00, Grad Norm: 0.0081 +[2025-02-19 18:44:37] (step=0136500) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0082 +[2025-02-19 18:44:46] (step=0136600) Train Loss: 0.1522, Train Steps/Sec: 11.63, Grad Norm: 0.0078 +[2025-02-19 18:44:55] (step=0136700) Train Loss: 0.1531, Train Steps/Sec: 11.62, Grad Norm: 0.0079 +[2025-02-19 18:45:03] (step=0136800) Train Loss: 0.1526, Train Steps/Sec: 11.61, Grad Norm: 0.0077 +[2025-02-19 18:45:12] (step=0136900) Train Loss: 0.1523, Train Steps/Sec: 11.61, Grad Norm: 0.0080 +[2025-02-19 18:45:21] (step=0137000) Train Loss: 0.1533, Train Steps/Sec: 11.62, Grad Norm: 0.0079 +[2025-02-19 18:45:29] (step=0137100) Train Loss: 0.1528, Train Steps/Sec: 11.62, Grad Norm: 0.0079 +[2025-02-19 18:45:38] (step=0137200) Train Loss: 0.1526, Train Steps/Sec: 11.62, Grad Norm: 0.0080 +[2025-02-19 18:45:46] (step=0137300) Train Loss: 0.1525, Train Steps/Sec: 11.61, Grad Norm: 0.0078 +[2025-02-19 18:45:55] (step=0137400) Train Loss: 0.1522, Train Steps/Sec: 11.61, Grad Norm: 0.0079 +[2025-02-19 18:46:04] (step=0137500) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0078 +[2025-02-19 18:46:12] (step=0137600) Train Loss: 0.1509, Train Steps/Sec: 11.58, Grad Norm: 0.0078 +[2025-02-19 18:46:22] (step=0137700) Train Loss: 0.1513, Train Steps/Sec: 10.02, Grad Norm: 0.0079 +[2025-02-19 18:46:31] (step=0137800) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0077 +[2025-02-19 18:46:40] (step=0137900) Train Loss: 0.1517, Train Steps/Sec: 10.92, Grad Norm: 0.0081 +[2025-02-19 18:46:50] (step=0138000) Train Loss: 0.1528, Train Steps/Sec: 9.99, Grad Norm: 0.0079 +[2025-02-19 18:46:59] (step=0138100) Train Loss: 0.1527, Train Steps/Sec: 11.29, Grad Norm: 0.0080 +[2025-02-19 18:47:07] (step=0138200) Train Loss: 0.1524, Train Steps/Sec: 11.63, Grad Norm: 0.0078 +[2025-02-19 18:47:16] (step=0138300) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0078 +[2025-02-19 18:47:25] (step=0138400) Train Loss: 0.1529, Train Steps/Sec: 11.62, Grad Norm: 0.0080 +[2025-02-19 18:47:33] (step=0138500) Train Loss: 0.1521, Train Steps/Sec: 11.63, Grad Norm: 0.0078 +[2025-02-19 18:47:42] (step=0138600) Train Loss: 0.1521, Train Steps/Sec: 11.62, Grad Norm: 0.0080 +[2025-02-19 18:47:50] (step=0138700) Train Loss: 0.1525, Train Steps/Sec: 11.60, Grad Norm: 0.0081 +[2025-02-19 18:47:59] (step=0138800) Train Loss: 0.1527, Train Steps/Sec: 11.62, Grad Norm: 0.0079 +[2025-02-19 18:48:09] (step=0138900) Train Loss: 0.1519, Train Steps/Sec: 10.00, Grad Norm: 0.0078 +[2025-02-19 18:48:18] (step=0139000) Train Loss: 0.1523, Train Steps/Sec: 11.61, Grad Norm: 0.0079 +[2025-02-19 18:48:26] (step=0139100) Train Loss: 0.1529, Train Steps/Sec: 11.61, Grad Norm: 0.0079 +[2025-02-19 18:48:35] (step=0139200) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0078 +[2025-02-19 18:48:43] (step=0139300) Train Loss: 0.1522, Train Steps/Sec: 11.64, Grad Norm: 0.0079 +[2025-02-19 18:48:52] (step=0139400) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0077 +[2025-02-19 18:49:01] (step=0139500) Train Loss: 0.1527, Train Steps/Sec: 11.63, Grad Norm: 0.0079 +[2025-02-19 18:49:09] (step=0139600) Train Loss: 0.1531, Train Steps/Sec: 11.63, Grad Norm: 0.0082 +[2025-02-19 18:49:18] (step=0139700) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0080 +[2025-02-19 18:49:26] (step=0139800) Train Loss: 0.1526, Train Steps/Sec: 11.63, Grad Norm: 0.0080 +[2025-02-19 18:49:35] (step=0139900) Train Loss: 0.1516, Train Steps/Sec: 11.63, Grad Norm: 0.0077 +[2025-02-19 18:49:44] (step=0140000) Train Loss: 0.1525, Train Steps/Sec: 11.62, Grad Norm: 0.0079 +[2025-02-19 18:49:45] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0140000.pt +[2025-02-19 18:49:55] (step=0140100) Train Loss: 0.1515, Train Steps/Sec: 9.11, Grad Norm: 0.0076 +[2025-02-19 18:50:05] (step=0140200) Train Loss: 0.1507, Train Steps/Sec: 9.28, Grad Norm: 0.0081 +[2025-02-19 18:50:14] (step=0140300) Train Loss: 0.1515, Train Steps/Sec: 11.28, Grad Norm: 0.0078 +[2025-02-19 18:50:23] (step=0140400) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0078 +[2025-02-19 18:50:32] (step=0140500) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0077 +[2025-02-19 18:50:40] (step=0140600) Train Loss: 0.1518, Train Steps/Sec: 11.64, Grad Norm: 0.0078 +[2025-02-19 18:50:49] (step=0140700) Train Loss: 0.1523, Train Steps/Sec: 11.64, Grad Norm: 0.0078 +[2025-02-19 18:50:57] (step=0140800) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0081 +[2025-02-19 18:51:06] (step=0140900) Train Loss: 0.1525, Train Steps/Sec: 11.63, Grad Norm: 0.0080 +[2025-02-19 18:51:15] (step=0141000) Train Loss: 0.1514, Train Steps/Sec: 11.62, Grad Norm: 0.0078 +[2025-02-19 18:51:23] (step=0141100) Train Loss: 0.1519, Train Steps/Sec: 11.62, Grad Norm: 0.0077 +[2025-02-19 18:51:32] (step=0141200) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0079 +[2025-02-19 18:51:40] (step=0141300) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0078 +[2025-02-19 18:51:50] (step=0141400) Train Loss: 0.1506, Train Steps/Sec: 9.97, Grad Norm: 0.0077 +[2025-02-19 18:51:59] (step=0141500) Train Loss: 0.1514, Train Steps/Sec: 11.63, Grad Norm: 0.0076 +[2025-02-19 18:52:08] (step=0141600) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0076 +[2025-02-19 18:52:16] (step=0141700) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0079 +[2025-02-19 18:52:25] (step=0141800) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0078 +[2025-02-19 18:52:33] (step=0141900) Train Loss: 0.1516, Train Steps/Sec: 11.62, Grad Norm: 0.0077 +[2025-02-19 18:52:42] (step=0142000) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0076 +[2025-02-19 18:52:51] (step=0142100) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0084 +[2025-02-19 18:52:59] (step=0142200) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0078 +[2025-02-19 18:53:09] (step=0142300) Train Loss: 0.1520, Train Steps/Sec: 9.98, Grad Norm: 0.0078 +[2025-02-19 18:53:18] (step=0142400) Train Loss: 0.1513, Train Steps/Sec: 10.89, Grad Norm: 0.0080 +[2025-02-19 18:53:27] (step=0142500) Train Loss: 0.1508, Train Steps/Sec: 11.27, Grad Norm: 0.0079 +[2025-02-19 18:53:36] (step=0142600) Train Loss: 0.1512, Train Steps/Sec: 11.56, Grad Norm: 0.0078 +[2025-02-19 18:53:46] (step=0142700) Train Loss: 0.1518, Train Steps/Sec: 10.06, Grad Norm: 0.0078 +[2025-02-19 18:53:54] (step=0142800) Train Loss: 0.1528, Train Steps/Sec: 11.60, Grad Norm: 0.0082 +[2025-02-19 18:54:03] (step=0142900) Train Loss: 0.1523, Train Steps/Sec: 11.59, Grad Norm: 0.0079 +[2025-02-19 18:54:12] (step=0143000) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0077 +[2025-02-19 18:54:20] (step=0143100) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0078 +[2025-02-19 18:54:29] (step=0143200) Train Loss: 0.1517, Train Steps/Sec: 11.60, Grad Norm: 0.0076 +[2025-02-19 18:54:38] (step=0143300) Train Loss: 0.1521, Train Steps/Sec: 11.60, Grad Norm: 0.0077 +[2025-02-19 18:54:46] (step=0143400) Train Loss: 0.1529, Train Steps/Sec: 11.61, Grad Norm: 0.0079 +[2025-02-19 18:54:55] (step=0143500) Train Loss: 0.1521, Train Steps/Sec: 11.61, Grad Norm: 0.0079 +[2025-02-19 18:55:03] (step=0143600) Train Loss: 0.1521, Train Steps/Sec: 11.60, Grad Norm: 0.0079 +[2025-02-19 18:55:12] (step=0143700) Train Loss: 0.1520, Train Steps/Sec: 11.60, Grad Norm: 0.0081 +[2025-02-19 18:55:21] (step=0143800) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0076 +[2025-02-19 18:55:31] (step=0143900) Train Loss: 0.1538, Train Steps/Sec: 10.04, Grad Norm: 0.0076 +[2025-02-19 18:55:39] (step=0144000) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0075 +[2025-02-19 18:55:48] (step=0144100) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0078 +[2025-02-19 18:55:57] (step=0144200) Train Loss: 0.1518, Train Steps/Sec: 11.60, Grad Norm: 0.0075 +[2025-02-19 18:56:05] (step=0144300) Train Loss: 0.1521, Train Steps/Sec: 11.60, Grad Norm: 0.0080 +[2025-02-19 18:56:14] (step=0144400) Train Loss: 0.1521, Train Steps/Sec: 11.23, Grad Norm: 0.0078 +[2025-02-19 18:56:24] (step=0144500) Train Loss: 0.1517, Train Steps/Sec: 9.75, Grad Norm: 0.0079 +[2025-02-19 18:56:33] (step=0144600) Train Loss: 0.1505, Train Steps/Sec: 11.27, Grad Norm: 0.0075 +[2025-02-19 18:56:42] (step=0144700) Train Loss: 0.1517, Train Steps/Sec: 11.60, Grad Norm: 0.0080 +[2025-02-19 18:56:50] (step=0144800) Train Loss: 0.1522, Train Steps/Sec: 11.62, Grad Norm: 0.0079 +[2025-02-19 18:56:59] (step=0144900) Train Loss: 0.1526, Train Steps/Sec: 11.65, Grad Norm: 0.0077 +[2025-02-19 18:57:08] (step=0145000) Train Loss: 0.1524, Train Steps/Sec: 11.64, Grad Norm: 0.0076 +[2025-02-19 18:57:16] (step=0145100) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0079 +[2025-02-19 18:57:26] (step=0145200) Train Loss: 0.1519, Train Steps/Sec: 10.01, Grad Norm: 0.0077 +[2025-02-19 18:57:35] (step=0145300) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0076 +[2025-02-19 18:57:43] (step=0145400) Train Loss: 0.1516, Train Steps/Sec: 11.62, Grad Norm: 0.0079 +[2025-02-19 18:57:52] (step=0145500) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0079 +[2025-02-19 18:58:01] (step=0145600) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0080 +[2025-02-19 18:58:09] (step=0145700) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0079 +[2025-02-19 18:58:18] (step=0145800) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0077 +[2025-02-19 18:58:26] (step=0145900) Train Loss: 0.1531, Train Steps/Sec: 11.64, Grad Norm: 0.0079 +[2025-02-19 18:58:35] (step=0146000) Train Loss: 0.1527, Train Steps/Sec: 11.64, Grad Norm: 0.0078 +[2025-02-19 18:58:44] (step=0146100) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0075 +[2025-02-19 18:58:52] (step=0146200) Train Loss: 0.1517, Train Steps/Sec: 11.64, Grad Norm: 0.0078 +[2025-02-19 18:59:01] (step=0146300) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0080 +[2025-02-19 18:59:11] (step=0146400) Train Loss: 0.1518, Train Steps/Sec: 10.06, Grad Norm: 0.0077 +[2025-02-19 18:59:19] (step=0146500) Train Loss: 0.1524, Train Steps/Sec: 11.62, Grad Norm: 0.0079 +[2025-02-19 18:59:29] (step=0146600) Train Loss: 0.1525, Train Steps/Sec: 10.58, Grad Norm: 0.0077 +[2025-02-19 18:59:38] (step=0146700) Train Loss: 0.1513, Train Steps/Sec: 10.30, Grad Norm: 0.0081 +[2025-02-19 18:59:47] (step=0146800) Train Loss: 0.1518, Train Steps/Sec: 11.30, Grad Norm: 0.0077 +[2025-02-19 18:59:56] (step=0146900) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0078 +[2025-02-19 19:00:05] (step=0147000) Train Loss: 0.1529, Train Steps/Sec: 11.64, Grad Norm: 0.0076 +[2025-02-19 19:00:13] (step=0147100) Train Loss: 0.1522, Train Steps/Sec: 11.64, Grad Norm: 0.0077 +[2025-02-19 19:00:22] (step=0147200) Train Loss: 0.1508, Train Steps/Sec: 11.65, Grad Norm: 0.0078 +[2025-02-19 19:00:30] (step=0147300) Train Loss: 0.1511, Train Steps/Sec: 11.65, Grad Norm: 0.0081 +[2025-02-19 19:00:39] (step=0147400) Train Loss: 0.1522, Train Steps/Sec: 11.65, Grad Norm: 0.0078 +[2025-02-19 19:00:47] (step=0147500) Train Loss: 0.1520, Train Steps/Sec: 11.64, Grad Norm: 0.0077 +[2025-02-19 19:00:56] (step=0147600) Train Loss: 0.1509, Train Steps/Sec: 11.59, Grad Norm: 0.0077 +[2025-02-19 19:01:06] (step=0147700) Train Loss: 0.1536, Train Steps/Sec: 10.02, Grad Norm: 0.0077 +[2025-02-19 19:01:15] (step=0147800) Train Loss: 0.1519, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 19:01:23] (step=0147900) Train Loss: 0.1511, Train Steps/Sec: 11.65, Grad Norm: 0.0078 +[2025-02-19 19:01:32] (step=0148000) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0077 +[2025-02-19 19:01:40] (step=0148100) Train Loss: 0.1528, Train Steps/Sec: 11.65, Grad Norm: 0.0079 +[2025-02-19 19:01:49] (step=0148200) Train Loss: 0.1526, Train Steps/Sec: 11.66, Grad Norm: 0.0079 +[2025-02-19 19:01:58] (step=0148300) Train Loss: 0.1530, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 19:02:06] (step=0148400) Train Loss: 0.1527, Train Steps/Sec: 11.65, Grad Norm: 0.0078 +[2025-02-19 19:02:15] (step=0148500) Train Loss: 0.1521, Train Steps/Sec: 11.62, Grad Norm: 0.0076 +[2025-02-19 19:02:23] (step=0148600) Train Loss: 0.1538, Train Steps/Sec: 11.65, Grad Norm: 0.0078 +[2025-02-19 19:02:32] (step=0148700) Train Loss: 0.1519, Train Steps/Sec: 11.65, Grad Norm: 0.0079 +[2025-02-19 19:02:42] (step=0148800) Train Loss: 0.1515, Train Steps/Sec: 10.28, Grad Norm: 0.0077 +[2025-02-19 19:02:53] (step=0148900) Train Loss: 0.1524, Train Steps/Sec: 9.23, Grad Norm: 0.0075 +[2025-02-19 19:03:01] (step=0149000) Train Loss: 0.1514, Train Steps/Sec: 11.27, Grad Norm: 0.0076 +[2025-02-19 19:03:10] (step=0149100) Train Loss: 0.1525, Train Steps/Sec: 11.61, Grad Norm: 0.0078 +[2025-02-19 19:03:19] (step=0149200) Train Loss: 0.1529, Train Steps/Sec: 11.60, Grad Norm: 0.0078 +[2025-02-19 19:03:27] (step=0149300) Train Loss: 0.1531, Train Steps/Sec: 11.60, Grad Norm: 0.0077 +[2025-02-19 19:03:36] (step=0149400) Train Loss: 0.1521, Train Steps/Sec: 11.59, Grad Norm: 0.0077 +[2025-02-19 19:03:44] (step=0149500) Train Loss: 0.1532, Train Steps/Sec: 11.59, Grad Norm: 0.0082 +[2025-02-19 19:03:53] (step=0149600) Train Loss: 0.1523, Train Steps/Sec: 11.59, Grad Norm: 0.0077 +[2025-02-19 19:04:02] (step=0149700) Train Loss: 0.1513, Train Steps/Sec: 11.59, Grad Norm: 0.0076 +[2025-02-19 19:04:10] (step=0149800) Train Loss: 0.1519, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 19:04:19] (step=0149900) Train Loss: 0.1514, Train Steps/Sec: 11.65, Grad Norm: 0.0080 +[2025-02-19 19:04:28] (step=0150000) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0076 +[2025-02-19 19:04:36] (step=0150100) Train Loss: 0.1522, Train Steps/Sec: 11.60, Grad Norm: 0.0079 +[2025-02-19 19:04:46] (step=0150200) Train Loss: 0.1514, Train Steps/Sec: 10.08, Grad Norm: 0.0077 +[2025-02-19 19:04:55] (step=0150300) Train Loss: 0.1521, Train Steps/Sec: 11.64, Grad Norm: 0.0077 +[2025-02-19 19:05:03] (step=0150400) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0075 +[2025-02-19 19:05:12] (step=0150500) Train Loss: 0.1528, Train Steps/Sec: 11.64, Grad Norm: 0.0074 +[2025-02-19 19:05:20] (step=0150600) Train Loss: 0.1532, Train Steps/Sec: 11.64, Grad Norm: 0.0077 +[2025-02-19 19:05:29] (step=0150700) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0078 +[2025-02-19 19:05:38] (step=0150800) Train Loss: 0.1524, Train Steps/Sec: 11.63, Grad Norm: 0.0075 +[2025-02-19 19:05:46] (step=0150900) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:05:56] (step=0151000) Train Loss: 0.1510, Train Steps/Sec: 10.02, Grad Norm: 0.0079 +[2025-02-19 19:06:05] (step=0151100) Train Loss: 0.1515, Train Steps/Sec: 10.93, Grad Norm: 0.0074 +[2025-02-19 19:06:14] (step=0151200) Train Loss: 0.1518, Train Steps/Sec: 11.31, Grad Norm: 0.0078 +[2025-02-19 19:06:23] (step=0151300) Train Loss: 0.1516, Train Steps/Sec: 11.65, Grad Norm: 0.0077 +[2025-02-19 19:06:33] (step=0151400) Train Loss: 0.1517, Train Steps/Sec: 10.01, Grad Norm: 0.0077 +[2025-02-19 19:06:41] (step=0151500) Train Loss: 0.1521, Train Steps/Sec: 11.64, Grad Norm: 0.0077 +[2025-02-19 19:06:50] (step=0151600) Train Loss: 0.1522, Train Steps/Sec: 11.64, Grad Norm: 0.0077 +[2025-02-19 19:06:59] (step=0151700) Train Loss: 0.1531, Train Steps/Sec: 11.61, Grad Norm: 0.0078 +[2025-02-19 19:07:07] (step=0151800) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0078 +[2025-02-19 19:07:16] (step=0151900) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 19:07:24] (step=0152000) Train Loss: 0.1518, Train Steps/Sec: 11.64, Grad Norm: 0.0079 +[2025-02-19 19:07:33] (step=0152100) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0076 +[2025-02-19 19:07:42] (step=0152200) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0077 +[2025-02-19 19:07:50] (step=0152300) Train Loss: 0.1527, Train Steps/Sec: 11.62, Grad Norm: 0.0078 +[2025-02-19 19:07:59] (step=0152400) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0078 +[2025-02-19 19:08:07] (step=0152500) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0077 +[2025-02-19 19:08:16] (step=0152600) Train Loss: 0.1525, Train Steps/Sec: 11.58, Grad Norm: 0.0074 +[2025-02-19 19:08:26] (step=0152700) Train Loss: 0.1517, Train Steps/Sec: 10.06, Grad Norm: 0.0078 +[2025-02-19 19:08:35] (step=0152800) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0076 +[2025-02-19 19:08:43] (step=0152900) Train Loss: 0.1520, Train Steps/Sec: 11.62, Grad Norm: 0.0076 +[2025-02-19 19:08:52] (step=0153000) Train Loss: 0.1519, Train Steps/Sec: 11.62, Grad Norm: 0.0080 +[2025-02-19 19:09:01] (step=0153100) Train Loss: 0.1517, Train Steps/Sec: 11.25, Grad Norm: 0.0074 +[2025-02-19 19:09:11] (step=0153200) Train Loss: 0.1523, Train Steps/Sec: 10.01, Grad Norm: 0.0075 +[2025-02-19 19:09:20] (step=0153300) Train Loss: 0.1523, Train Steps/Sec: 10.94, Grad Norm: 0.0077 +[2025-02-19 19:09:28] (step=0153400) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0077 +[2025-02-19 19:09:37] (step=0153500) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0075 +[2025-02-19 19:09:46] (step=0153600) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0076 +[2025-02-19 19:09:54] (step=0153700) Train Loss: 0.1528, Train Steps/Sec: 11.63, Grad Norm: 0.0078 +[2025-02-19 19:10:03] (step=0153800) Train Loss: 0.1527, Train Steps/Sec: 11.63, Grad Norm: 0.0079 +[2025-02-19 19:10:13] (step=0153900) Train Loss: 0.1510, Train Steps/Sec: 10.05, Grad Norm: 0.0079 +[2025-02-19 19:10:21] (step=0154000) Train Loss: 0.1517, Train Steps/Sec: 11.66, Grad Norm: 0.0080 +[2025-02-19 19:10:30] (step=0154100) Train Loss: 0.1530, Train Steps/Sec: 11.67, Grad Norm: 0.0078 +[2025-02-19 19:10:38] (step=0154200) Train Loss: 0.1537, Train Steps/Sec: 11.67, Grad Norm: 0.0077 +[2025-02-19 19:10:47] (step=0154300) Train Loss: 0.1509, Train Steps/Sec: 11.67, Grad Norm: 0.0079 +[2025-02-19 19:10:56] (step=0154400) Train Loss: 0.1522, Train Steps/Sec: 11.64, Grad Norm: 0.0079 +[2025-02-19 19:11:04] (step=0154500) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0078 +[2025-02-19 19:11:13] (step=0154600) Train Loss: 0.1523, Train Steps/Sec: 11.65, Grad Norm: 0.0077 +[2025-02-19 19:11:21] (step=0154700) Train Loss: 0.1528, Train Steps/Sec: 11.65, Grad Norm: 0.0080 +[2025-02-19 19:11:30] (step=0154800) Train Loss: 0.1524, Train Steps/Sec: 11.65, Grad Norm: 0.0078 +[2025-02-19 19:11:39] (step=0154900) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0075 +[2025-02-19 19:11:47] (step=0155000) Train Loss: 0.1524, Train Steps/Sec: 11.63, Grad Norm: 0.0077 +[2025-02-19 19:11:56] (step=0155100) Train Loss: 0.1524, Train Steps/Sec: 11.60, Grad Norm: 0.0077 +[2025-02-19 19:12:06] (step=0155200) Train Loss: 0.1512, Train Steps/Sec: 10.05, Grad Norm: 0.0075 +[2025-02-19 19:12:15] (step=0155300) Train Loss: 0.1523, Train Steps/Sec: 10.26, Grad Norm: 0.0078 +[2025-02-19 19:12:25] (step=0155400) Train Loss: 0.1514, Train Steps/Sec: 10.92, Grad Norm: 0.0077 +[2025-02-19 19:12:34] (step=0155500) Train Loss: 0.1510, Train Steps/Sec: 10.96, Grad Norm: 0.0077 +[2025-02-19 19:12:42] (step=0155600) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:12:51] (step=0155700) Train Loss: 0.1517, Train Steps/Sec: 11.65, Grad Norm: 0.0079 +[2025-02-19 19:13:00] (step=0155800) Train Loss: 0.1522, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:13:08] (step=0155900) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0077 +[2025-02-19 19:13:17] (step=0156000) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 19:13:25] (step=0156100) Train Loss: 0.1531, Train Steps/Sec: 11.66, Grad Norm: 0.0076 +[2025-02-19 19:13:34] (step=0156200) Train Loss: 0.1529, Train Steps/Sec: 11.66, Grad Norm: 0.0078 +[2025-02-19 19:13:42] (step=0156300) Train Loss: 0.1516, Train Steps/Sec: 11.67, Grad Norm: 0.0076 +[2025-02-19 19:13:52] (step=0156400) Train Loss: 0.1514, Train Steps/Sec: 10.02, Grad Norm: 0.0077 +[2025-02-19 19:14:01] (step=0156500) Train Loss: 0.1525, Train Steps/Sec: 11.63, Grad Norm: 0.0080 +[2025-02-19 19:14:10] (step=0156600) Train Loss: 0.1522, Train Steps/Sec: 11.63, Grad Norm: 0.0075 +[2025-02-19 19:14:18] (step=0156700) Train Loss: 0.1527, Train Steps/Sec: 11.63, Grad Norm: 0.0077 +[2025-02-19 19:14:27] (step=0156800) Train Loss: 0.1512, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:14:35] (step=0156900) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0077 +[2025-02-19 19:14:44] (step=0157000) Train Loss: 0.1506, Train Steps/Sec: 11.66, Grad Norm: 0.0081 +[2025-02-19 19:14:53] (step=0157100) Train Loss: 0.1506, Train Steps/Sec: 11.65, Grad Norm: 0.0074 +[2025-02-19 19:15:01] (step=0157200) Train Loss: 0.1520, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:15:10] (step=0157300) Train Loss: 0.1518, Train Steps/Sec: 11.66, Grad Norm: 0.0077 +[2025-02-19 19:15:18] (step=0157400) Train Loss: 0.1520, Train Steps/Sec: 11.67, Grad Norm: 0.0078 +[2025-02-19 19:15:28] (step=0157500) Train Loss: 0.1514, Train Steps/Sec: 10.29, Grad Norm: 0.0075 +[2025-02-19 19:15:37] (step=0157600) Train Loss: 0.1516, Train Steps/Sec: 10.59, Grad Norm: 0.0078 +[2025-02-19 19:15:48] (step=0157700) Train Loss: 0.1518, Train Steps/Sec: 9.81, Grad Norm: 0.0076 +[2025-02-19 19:15:56] (step=0157800) Train Loss: 0.1524, Train Steps/Sec: 11.65, Grad Norm: 0.0078 +[2025-02-19 19:16:05] (step=0157900) Train Loss: 0.1524, Train Steps/Sec: 11.64, Grad Norm: 0.0075 +[2025-02-19 19:16:13] (step=0158000) Train Loss: 0.1517, Train Steps/Sec: 11.65, Grad Norm: 0.0079 +[2025-02-19 19:16:22] (step=0158100) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 19:16:31] (step=0158200) Train Loss: 0.1508, Train Steps/Sec: 11.66, Grad Norm: 0.0076 +[2025-02-19 19:16:39] (step=0158300) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 19:16:48] (step=0158400) Train Loss: 0.1524, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 19:16:56] (step=0158500) Train Loss: 0.1521, Train Steps/Sec: 11.67, Grad Norm: 0.0076 +[2025-02-19 19:17:05] (step=0158600) Train Loss: 0.1517, Train Steps/Sec: 11.67, Grad Norm: 0.0078 +[2025-02-19 19:17:13] (step=0158700) Train Loss: 0.1508, Train Steps/Sec: 11.67, Grad Norm: 0.0075 +[2025-02-19 19:17:22] (step=0158800) Train Loss: 0.1513, Train Steps/Sec: 11.67, Grad Norm: 0.0077 +[2025-02-19 19:17:32] (step=0158900) Train Loss: 0.1517, Train Steps/Sec: 9.98, Grad Norm: 0.0079 +[2025-02-19 19:17:41] (step=0159000) Train Loss: 0.1516, Train Steps/Sec: 11.66, Grad Norm: 0.0076 +[2025-02-19 19:17:49] (step=0159100) Train Loss: 0.1524, Train Steps/Sec: 11.66, Grad Norm: 0.0077 +[2025-02-19 19:17:58] (step=0159200) Train Loss: 0.1509, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:18:06] (step=0159300) Train Loss: 0.1513, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 19:18:15] (step=0159400) Train Loss: 0.1505, Train Steps/Sec: 11.66, Grad Norm: 0.0077 +[2025-02-19 19:18:24] (step=0159500) Train Loss: 0.1532, Train Steps/Sec: 11.66, Grad Norm: 0.0076 +[2025-02-19 19:18:32] (step=0159600) Train Loss: 0.1514, Train Steps/Sec: 11.66, Grad Norm: 0.0078 +[2025-02-19 19:18:42] (step=0159700) Train Loss: 0.1508, Train Steps/Sec: 9.75, Grad Norm: 0.0075 +[2025-02-19 19:18:51] (step=0159800) Train Loss: 0.1521, Train Steps/Sec: 11.32, Grad Norm: 0.0074 +[2025-02-19 19:19:00] (step=0159900) Train Loss: 0.1517, Train Steps/Sec: 11.30, Grad Norm: 0.0077 +[2025-02-19 19:19:09] (step=0160000) Train Loss: 0.1512, Train Steps/Sec: 11.66, Grad Norm: 0.0077 +[2025-02-19 19:19:10] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0160000.pt +[2025-02-19 19:19:18] (step=0160100) Train Loss: 0.1522, Train Steps/Sec: 10.24, Grad Norm: 0.0075 +[2025-02-19 19:19:28] (step=0160200) Train Loss: 0.1525, Train Steps/Sec: 9.93, Grad Norm: 0.0075 +[2025-02-19 19:19:37] (step=0160300) Train Loss: 0.1528, Train Steps/Sec: 11.62, Grad Norm: 0.0077 +[2025-02-19 19:19:46] (step=0160400) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0075 +[2025-02-19 19:19:54] (step=0160500) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0075 +[2025-02-19 19:20:03] (step=0160600) Train Loss: 0.1531, Train Steps/Sec: 11.63, Grad Norm: 0.0076 +[2025-02-19 19:20:11] (step=0160700) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:20:20] (step=0160800) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 19:20:29] (step=0160900) Train Loss: 0.1531, Train Steps/Sec: 11.64, Grad Norm: 0.0074 +[2025-02-19 19:20:37] (step=0161000) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0075 +[2025-02-19 19:20:46] (step=0161100) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0077 +[2025-02-19 19:20:54] (step=0161200) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0076 +[2025-02-19 19:21:03] (step=0161300) Train Loss: 0.1524, Train Steps/Sec: 11.64, Grad Norm: 0.0076 +[2025-02-19 19:21:13] (step=0161400) Train Loss: 0.1506, Train Steps/Sec: 9.77, Grad Norm: 0.0074 +[2025-02-19 19:21:22] (step=0161500) Train Loss: 0.1526, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 19:21:30] (step=0161600) Train Loss: 0.1509, Train Steps/Sec: 11.66, Grad Norm: 0.0078 +[2025-02-19 19:21:39] (step=0161700) Train Loss: 0.1532, Train Steps/Sec: 11.66, Grad Norm: 0.0077 +[2025-02-19 19:21:48] (step=0161800) Train Loss: 0.1528, Train Steps/Sec: 10.94, Grad Norm: 0.0077 +[2025-02-19 19:21:58] (step=0161900) Train Loss: 0.1531, Train Steps/Sec: 10.27, Grad Norm: 0.0078 +[2025-02-19 19:22:07] (step=0162000) Train Loss: 0.1524, Train Steps/Sec: 10.98, Grad Norm: 0.0077 +[2025-02-19 19:22:16] (step=0162100) Train Loss: 0.1514, Train Steps/Sec: 11.65, Grad Norm: 0.0078 +[2025-02-19 19:22:24] (step=0162200) Train Loss: 0.1520, Train Steps/Sec: 11.65, Grad Norm: 0.0074 +[2025-02-19 19:22:33] (step=0162300) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 19:22:41] (step=0162400) Train Loss: 0.1524, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:22:50] (step=0162500) Train Loss: 0.1518, Train Steps/Sec: 11.66, Grad Norm: 0.0076 +[2025-02-19 19:22:59] (step=0162600) Train Loss: 0.1523, Train Steps/Sec: 11.60, Grad Norm: 0.0075 +[2025-02-19 19:23:08] (step=0162700) Train Loss: 0.1517, Train Steps/Sec: 10.08, Grad Norm: 0.0076 +[2025-02-19 19:23:17] (step=0162800) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0078 +[2025-02-19 19:23:26] (step=0162900) Train Loss: 0.1525, Train Steps/Sec: 11.64, Grad Norm: 0.0074 +[2025-02-19 19:23:34] (step=0163000) Train Loss: 0.1521, Train Steps/Sec: 11.64, Grad Norm: 0.0076 +[2025-02-19 19:23:43] (step=0163100) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0077 +[2025-02-19 19:23:51] (step=0163200) Train Loss: 0.1528, Train Steps/Sec: 11.65, Grad Norm: 0.0079 +[2025-02-19 19:24:00] (step=0163300) Train Loss: 0.1508, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 19:24:09] (step=0163400) Train Loss: 0.1521, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 19:24:17] (step=0163500) Train Loss: 0.1511, Train Steps/Sec: 11.65, Grad Norm: 0.0079 +[2025-02-19 19:24:26] (step=0163600) Train Loss: 0.1514, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 19:24:34] (step=0163700) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0074 +[2025-02-19 19:24:43] (step=0163800) Train Loss: 0.1514, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 19:24:53] (step=0163900) Train Loss: 0.1512, Train Steps/Sec: 10.05, Grad Norm: 0.0075 +[2025-02-19 19:25:03] (step=0164000) Train Loss: 0.1510, Train Steps/Sec: 10.29, Grad Norm: 0.0074 +[2025-02-19 19:25:12] (step=0164100) Train Loss: 0.1528, Train Steps/Sec: 10.65, Grad Norm: 0.0074 +[2025-02-19 19:25:21] (step=0164200) Train Loss: 0.1517, Train Steps/Sec: 11.32, Grad Norm: 0.0074 +[2025-02-19 19:25:29] (step=0164300) Train Loss: 0.1510, Train Steps/Sec: 11.67, Grad Norm: 0.0077 +[2025-02-19 19:25:38] (step=0164400) Train Loss: 0.1529, Train Steps/Sec: 11.67, Grad Norm: 0.0077 +[2025-02-19 19:25:47] (step=0164500) Train Loss: 0.1506, Train Steps/Sec: 11.67, Grad Norm: 0.0074 +[2025-02-19 19:25:55] (step=0164600) Train Loss: 0.1511, Train Steps/Sec: 11.67, Grad Norm: 0.0076 +[2025-02-19 19:26:04] (step=0164700) Train Loss: 0.1512, Train Steps/Sec: 11.67, Grad Norm: 0.0074 +[2025-02-19 19:26:12] (step=0164800) Train Loss: 0.1529, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:26:21] (step=0164900) Train Loss: 0.1518, Train Steps/Sec: 11.66, Grad Norm: 0.0076 +[2025-02-19 19:26:29] (step=0165000) Train Loss: 0.1509, Train Steps/Sec: 11.66, Grad Norm: 0.0078 +[2025-02-19 19:26:38] (step=0165100) Train Loss: 0.1515, Train Steps/Sec: 11.59, Grad Norm: 0.0077 +[2025-02-19 19:26:48] (step=0165200) Train Loss: 0.1525, Train Steps/Sec: 10.01, Grad Norm: 0.0074 +[2025-02-19 19:26:57] (step=0165300) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0074 +[2025-02-19 19:27:05] (step=0165400) Train Loss: 0.1521, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 19:27:14] (step=0165500) Train Loss: 0.1528, Train Steps/Sec: 11.66, Grad Norm: 0.0077 +[2025-02-19 19:27:22] (step=0165600) Train Loss: 0.1522, Train Steps/Sec: 11.66, Grad Norm: 0.0076 +[2025-02-19 19:27:31] (step=0165700) Train Loss: 0.1530, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:27:39] (step=0165800) Train Loss: 0.1512, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 19:27:48] (step=0165900) Train Loss: 0.1517, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 19:27:57] (step=0166000) Train Loss: 0.1537, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:28:05] (step=0166100) Train Loss: 0.1522, Train Steps/Sec: 11.67, Grad Norm: 0.0074 +[2025-02-19 19:28:15] (step=0166200) Train Loss: 0.1535, Train Steps/Sec: 10.00, Grad Norm: 0.0075 +[2025-02-19 19:28:24] (step=0166300) Train Loss: 0.1526, Train Steps/Sec: 10.94, Grad Norm: 0.0078 +[2025-02-19 19:28:35] (step=0166400) Train Loss: 0.1513, Train Steps/Sec: 9.80, Grad Norm: 0.0075 +[2025-02-19 19:28:43] (step=0166500) Train Loss: 0.1511, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 19:28:52] (step=0166600) Train Loss: 0.1519, Train Steps/Sec: 11.66, Grad Norm: 0.0077 +[2025-02-19 19:29:00] (step=0166700) Train Loss: 0.1515, Train Steps/Sec: 11.66, Grad Norm: 0.0076 +[2025-02-19 19:29:09] (step=0166800) Train Loss: 0.1530, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:29:17] (step=0166900) Train Loss: 0.1516, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:29:26] (step=0167000) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 19:29:35] (step=0167100) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0077 +[2025-02-19 19:29:43] (step=0167200) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0076 +[2025-02-19 19:29:52] (step=0167300) Train Loss: 0.1509, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:30:00] (step=0167400) Train Loss: 0.1525, Train Steps/Sec: 11.64, Grad Norm: 0.0073 +[2025-02-19 19:30:09] (step=0167500) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0073 +[2025-02-19 19:30:18] (step=0167600) Train Loss: 0.1522, Train Steps/Sec: 11.66, Grad Norm: 0.0077 +[2025-02-19 19:30:28] (step=0167700) Train Loss: 0.1521, Train Steps/Sec: 10.07, Grad Norm: 0.0076 +[2025-02-19 19:30:36] (step=0167800) Train Loss: 0.1540, Train Steps/Sec: 11.65, Grad Norm: 0.0077 +[2025-02-19 19:30:45] (step=0167900) Train Loss: 0.1534, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 19:30:53] (step=0168000) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:31:02] (step=0168100) Train Loss: 0.1531, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:31:10] (step=0168200) Train Loss: 0.1523, Train Steps/Sec: 11.67, Grad Norm: 0.0073 +[2025-02-19 19:31:19] (step=0168300) Train Loss: 0.1522, Train Steps/Sec: 11.67, Grad Norm: 0.0077 +[2025-02-19 19:31:29] (step=0168400) Train Loss: 0.1516, Train Steps/Sec: 9.74, Grad Norm: 0.0076 +[2025-02-19 19:31:38] (step=0168500) Train Loss: 0.1520, Train Steps/Sec: 11.31, Grad Norm: 0.0074 +[2025-02-19 19:31:47] (step=0168600) Train Loss: 0.1521, Train Steps/Sec: 11.31, Grad Norm: 0.0075 +[2025-02-19 19:31:56] (step=0168700) Train Loss: 0.1531, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:32:04] (step=0168800) Train Loss: 0.1521, Train Steps/Sec: 11.64, Grad Norm: 0.0074 +[2025-02-19 19:32:14] (step=0168900) Train Loss: 0.1515, Train Steps/Sec: 10.04, Grad Norm: 0.0074 +[2025-02-19 19:32:23] (step=0169000) Train Loss: 0.1518, Train Steps/Sec: 11.67, Grad Norm: 0.0074 +[2025-02-19 19:32:31] (step=0169100) Train Loss: 0.1512, Train Steps/Sec: 11.68, Grad Norm: 0.0076 +[2025-02-19 19:32:40] (step=0169200) Train Loss: 0.1512, Train Steps/Sec: 11.67, Grad Norm: 0.0074 +[2025-02-19 19:32:48] (step=0169300) Train Loss: 0.1505, Train Steps/Sec: 11.67, Grad Norm: 0.0077 +[2025-02-19 19:32:57] (step=0169400) Train Loss: 0.1512, Train Steps/Sec: 11.68, Grad Norm: 0.0075 +[2025-02-19 19:33:05] (step=0169500) Train Loss: 0.1521, Train Steps/Sec: 11.68, Grad Norm: 0.0074 +[2025-02-19 19:33:14] (step=0169600) Train Loss: 0.1525, Train Steps/Sec: 11.67, Grad Norm: 0.0076 +[2025-02-19 19:33:23] (step=0169700) Train Loss: 0.1516, Train Steps/Sec: 11.68, Grad Norm: 0.0076 +[2025-02-19 19:33:31] (step=0169800) Train Loss: 0.1526, Train Steps/Sec: 11.68, Grad Norm: 0.0074 +[2025-02-19 19:33:40] (step=0169900) Train Loss: 0.1522, Train Steps/Sec: 11.68, Grad Norm: 0.0078 +[2025-02-19 19:33:48] (step=0170000) Train Loss: 0.1521, Train Steps/Sec: 11.68, Grad Norm: 0.0077 +[2025-02-19 19:33:57] (step=0170100) Train Loss: 0.1518, Train Steps/Sec: 11.68, Grad Norm: 0.0074 +[2025-02-19 19:34:07] (step=0170200) Train Loss: 0.1520, Train Steps/Sec: 10.07, Grad Norm: 0.0075 +[2025-02-19 19:34:15] (step=0170300) Train Loss: 0.1502, Train Steps/Sec: 11.68, Grad Norm: 0.0076 +[2025-02-19 19:34:24] (step=0170400) Train Loss: 0.1494, Train Steps/Sec: 11.69, Grad Norm: 0.0074 +[2025-02-19 19:34:33] (step=0170500) Train Loss: 0.1521, Train Steps/Sec: 10.66, Grad Norm: 0.0072 +[2025-02-19 19:34:43] (step=0170600) Train Loss: 0.1523, Train Steps/Sec: 10.65, Grad Norm: 0.0078 +[2025-02-19 19:34:52] (step=0170700) Train Loss: 0.1522, Train Steps/Sec: 11.03, Grad Norm: 0.0075 +[2025-02-19 19:35:00] (step=0170800) Train Loss: 0.1518, Train Steps/Sec: 11.71, Grad Norm: 0.0075 +[2025-02-19 19:35:09] (step=0170900) Train Loss: 0.1531, Train Steps/Sec: 11.70, Grad Norm: 0.0073 +[2025-02-19 19:35:17] (step=0171000) Train Loss: 0.1504, Train Steps/Sec: 11.70, Grad Norm: 0.0075 +[2025-02-19 19:35:26] (step=0171100) Train Loss: 0.1522, Train Steps/Sec: 11.69, Grad Norm: 0.0075 +[2025-02-19 19:35:35] (step=0171200) Train Loss: 0.1501, Train Steps/Sec: 11.70, Grad Norm: 0.0078 +[2025-02-19 19:35:43] (step=0171300) Train Loss: 0.1510, Train Steps/Sec: 11.70, Grad Norm: 0.0074 +[2025-02-19 19:35:53] (step=0171400) Train Loss: 0.1529, Train Steps/Sec: 10.12, Grad Norm: 0.0073 +[2025-02-19 19:36:02] (step=0171500) Train Loss: 0.1534, Train Steps/Sec: 11.68, Grad Norm: 0.0076 +[2025-02-19 19:36:10] (step=0171600) Train Loss: 0.1532, Train Steps/Sec: 11.66, Grad Norm: 0.0077 +[2025-02-19 19:36:19] (step=0171700) Train Loss: 0.1515, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:36:27] (step=0171800) Train Loss: 0.1510, Train Steps/Sec: 11.67, Grad Norm: 0.0074 +[2025-02-19 19:36:36] (step=0171900) Train Loss: 0.1522, Train Steps/Sec: 11.67, Grad Norm: 0.0074 +[2025-02-19 19:36:44] (step=0172000) Train Loss: 0.1528, Train Steps/Sec: 11.67, Grad Norm: 0.0076 +[2025-02-19 19:36:53] (step=0172100) Train Loss: 0.1506, Train Steps/Sec: 11.67, Grad Norm: 0.0075 +[2025-02-19 19:37:02] (step=0172200) Train Loss: 0.1518, Train Steps/Sec: 11.67, Grad Norm: 0.0074 +[2025-02-19 19:37:10] (step=0172300) Train Loss: 0.1527, Train Steps/Sec: 11.66, Grad Norm: 0.0076 +[2025-02-19 19:37:19] (step=0172400) Train Loss: 0.1519, Train Steps/Sec: 11.67, Grad Norm: 0.0074 +[2025-02-19 19:37:27] (step=0172500) Train Loss: 0.1505, Train Steps/Sec: 11.68, Grad Norm: 0.0079 +[2025-02-19 19:37:36] (step=0172600) Train Loss: 0.1516, Train Steps/Sec: 11.68, Grad Norm: 0.0073 +[2025-02-19 19:37:47] (step=0172700) Train Loss: 0.1510, Train Steps/Sec: 8.87, Grad Norm: 0.0072 +[2025-02-19 19:37:56] (step=0172800) Train Loss: 0.1510, Train Steps/Sec: 10.90, Grad Norm: 0.0076 +[2025-02-19 19:38:05] (step=0172900) Train Loss: 0.1511, Train Steps/Sec: 11.22, Grad Norm: 0.0074 +[2025-02-19 19:38:14] (step=0173000) Train Loss: 0.1515, Train Steps/Sec: 11.59, Grad Norm: 0.0075 +[2025-02-19 19:38:22] (step=0173100) Train Loss: 0.1510, Train Steps/Sec: 11.59, Grad Norm: 0.0073 +[2025-02-19 19:38:31] (step=0173200) Train Loss: 0.1516, Train Steps/Sec: 11.60, Grad Norm: 0.0076 +[2025-02-19 19:38:40] (step=0173300) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0076 +[2025-02-19 19:38:48] (step=0173400) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0077 +[2025-02-19 19:38:57] (step=0173500) Train Loss: 0.1525, Train Steps/Sec: 11.60, Grad Norm: 0.0073 +[2025-02-19 19:39:06] (step=0173600) Train Loss: 0.1522, Train Steps/Sec: 11.60, Grad Norm: 0.0081 +[2025-02-19 19:39:14] (step=0173700) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0075 +[2025-02-19 19:39:23] (step=0173800) Train Loss: 0.1529, Train Steps/Sec: 11.64, Grad Norm: 0.0075 +[2025-02-19 19:39:33] (step=0173900) Train Loss: 0.1525, Train Steps/Sec: 9.93, Grad Norm: 0.0073 +[2025-02-19 19:39:41] (step=0174000) Train Loss: 0.1509, Train Steps/Sec: 11.68, Grad Norm: 0.0073 +[2025-02-19 19:39:50] (step=0174100) Train Loss: 0.1518, Train Steps/Sec: 11.68, Grad Norm: 0.0075 +[2025-02-19 19:39:58] (step=0174200) Train Loss: 0.1520, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 19:40:07] (step=0174300) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 19:40:16] (step=0174400) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 19:40:24] (step=0174500) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0074 +[2025-02-19 19:40:33] (step=0174600) Train Loss: 0.1531, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:40:41] (step=0174700) Train Loss: 0.1526, Train Steps/Sec: 11.64, Grad Norm: 0.0074 +[2025-02-19 19:40:50] (step=0174800) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0074 +[2025-02-19 19:41:00] (step=0174900) Train Loss: 0.1515, Train Steps/Sec: 9.99, Grad Norm: 0.0076 +[2025-02-19 19:41:09] (step=0175000) Train Loss: 0.1505, Train Steps/Sec: 10.93, Grad Norm: 0.0075 +[2025-02-19 19:41:18] (step=0175100) Train Loss: 0.1512, Train Steps/Sec: 11.29, Grad Norm: 0.0073 +[2025-02-19 19:41:28] (step=0175200) Train Loss: 0.1516, Train Steps/Sec: 9.97, Grad Norm: 0.0074 +[2025-02-19 19:41:37] (step=0175300) Train Loss: 0.1516, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:41:45] (step=0175400) Train Loss: 0.1507, Train Steps/Sec: 11.66, Grad Norm: 0.0073 +[2025-02-19 19:41:54] (step=0175500) Train Loss: 0.1515, Train Steps/Sec: 11.66, Grad Norm: 0.0076 +[2025-02-19 19:42:02] (step=0175600) Train Loss: 0.1515, Train Steps/Sec: 11.66, Grad Norm: 0.0073 +[2025-02-19 19:42:11] (step=0175700) Train Loss: 0.1518, Train Steps/Sec: 11.64, Grad Norm: 0.0074 +[2025-02-19 19:42:20] (step=0175800) Train Loss: 0.1524, Train Steps/Sec: 11.62, Grad Norm: 0.0075 +[2025-02-19 19:42:28] (step=0175900) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0073 +[2025-02-19 19:42:37] (step=0176000) Train Loss: 0.1525, Train Steps/Sec: 11.64, Grad Norm: 0.0075 +[2025-02-19 19:42:45] (step=0176100) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0077 +[2025-02-19 19:42:54] (step=0176200) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0072 +[2025-02-19 19:43:03] (step=0176300) Train Loss: 0.1521, Train Steps/Sec: 11.64, Grad Norm: 0.0076 +[2025-02-19 19:43:12] (step=0176400) Train Loss: 0.1512, Train Steps/Sec: 10.11, Grad Norm: 0.0075 +[2025-02-19 19:43:21] (step=0176500) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0076 +[2025-02-19 19:43:30] (step=0176600) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:43:38] (step=0176700) Train Loss: 0.1508, Train Steps/Sec: 11.66, Grad Norm: 0.0073 +[2025-02-19 19:43:47] (step=0176800) Train Loss: 0.1517, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:43:55] (step=0176900) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0074 +[2025-02-19 19:44:04] (step=0177000) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0073 +[2025-02-19 19:44:14] (step=0177100) Train Loss: 0.1503, Train Steps/Sec: 9.74, Grad Norm: 0.0073 +[2025-02-19 19:44:23] (step=0177200) Train Loss: 0.1519, Train Steps/Sec: 11.31, Grad Norm: 0.0075 +[2025-02-19 19:44:32] (step=0177300) Train Loss: 0.1518, Train Steps/Sec: 11.32, Grad Norm: 0.0073 +[2025-02-19 19:44:41] (step=0177400) Train Loss: 0.1516, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:44:49] (step=0177500) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 19:44:58] (step=0177600) Train Loss: 0.1515, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 19:45:08] (step=0177700) Train Loss: 0.1513, Train Steps/Sec: 10.06, Grad Norm: 0.0075 +[2025-02-19 19:45:16] (step=0177800) Train Loss: 0.1508, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 19:45:25] (step=0177900) Train Loss: 0.1507, Train Steps/Sec: 11.65, Grad Norm: 0.0074 +[2025-02-19 19:45:33] (step=0178000) Train Loss: 0.1520, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 19:45:42] (step=0178100) Train Loss: 0.1515, Train Steps/Sec: 11.66, Grad Norm: 0.0076 +[2025-02-19 19:45:51] (step=0178200) Train Loss: 0.1509, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:45:59] (step=0178300) Train Loss: 0.1521, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 19:46:08] (step=0178400) Train Loss: 0.1523, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 19:46:16] (step=0178500) Train Loss: 0.1519, Train Steps/Sec: 11.66, Grad Norm: 0.0073 +[2025-02-19 19:46:25] (step=0178600) Train Loss: 0.1530, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 19:46:33] (step=0178700) Train Loss: 0.1520, Train Steps/Sec: 11.65, Grad Norm: 0.0074 +[2025-02-19 19:46:42] (step=0178800) Train Loss: 0.1519, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 19:46:52] (step=0178900) Train Loss: 0.1516, Train Steps/Sec: 10.11, Grad Norm: 0.0074 +[2025-02-19 19:47:01] (step=0179000) Train Loss: 0.1520, Train Steps/Sec: 11.64, Grad Norm: 0.0076 +[2025-02-19 19:47:09] (step=0179100) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0075 +[2025-02-19 19:47:19] (step=0179200) Train Loss: 0.1519, Train Steps/Sec: 10.57, Grad Norm: 0.0075 +[2025-02-19 19:47:28] (step=0179300) Train Loss: 0.1534, Train Steps/Sec: 10.58, Grad Norm: 0.0074 +[2025-02-19 19:47:37] (step=0179400) Train Loss: 0.1515, Train Steps/Sec: 10.95, Grad Norm: 0.0073 +[2025-02-19 19:47:46] (step=0179500) Train Loss: 0.1517, Train Steps/Sec: 11.61, Grad Norm: 0.0075 +[2025-02-19 19:47:54] (step=0179600) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0076 +[2025-02-19 19:48:03] (step=0179700) Train Loss: 0.1515, Train Steps/Sec: 11.65, Grad Norm: 0.0074 +[2025-02-19 19:48:12] (step=0179800) Train Loss: 0.1523, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 19:48:20] (step=0179900) Train Loss: 0.1530, Train Steps/Sec: 11.66, Grad Norm: 0.0073 +[2025-02-19 19:48:29] (step=0180000) Train Loss: 0.1528, Train Steps/Sec: 11.66, Grad Norm: 0.0077 +[2025-02-19 19:48:30] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0180000.pt +[2025-02-19 19:48:38] (step=0180100) Train Loss: 0.1513, Train Steps/Sec: 10.23, Grad Norm: 0.0076 +[2025-02-19 19:48:48] (step=0180200) Train Loss: 0.1516, Train Steps/Sec: 10.00, Grad Norm: 0.0073 +[2025-02-19 19:48:57] (step=0180300) Train Loss: 0.1520, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:49:06] (step=0180400) Train Loss: 0.1501, Train Steps/Sec: 11.67, Grad Norm: 0.0075 +[2025-02-19 19:49:14] (step=0180500) Train Loss: 0.1509, Train Steps/Sec: 11.68, Grad Norm: 0.0072 +[2025-02-19 19:49:23] (step=0180600) Train Loss: 0.1519, Train Steps/Sec: 11.68, Grad Norm: 0.0073 +[2025-02-19 19:49:31] (step=0180700) Train Loss: 0.1516, Train Steps/Sec: 11.68, Grad Norm: 0.0074 +[2025-02-19 19:49:40] (step=0180800) Train Loss: 0.1519, Train Steps/Sec: 11.59, Grad Norm: 0.0075 +[2025-02-19 19:49:49] (step=0180900) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 19:49:57] (step=0181000) Train Loss: 0.1518, Train Steps/Sec: 11.68, Grad Norm: 0.0074 +[2025-02-19 19:50:06] (step=0181100) Train Loss: 0.1515, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:50:14] (step=0181200) Train Loss: 0.1527, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:50:23] (step=0181300) Train Loss: 0.1513, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 19:50:34] (step=0181400) Train Loss: 0.1513, Train Steps/Sec: 8.79, Grad Norm: 0.0074 +[2025-02-19 19:50:43] (step=0181500) Train Loss: 0.1513, Train Steps/Sec: 11.26, Grad Norm: 0.0073 +[2025-02-19 19:50:52] (step=0181600) Train Loss: 0.1515, Train Steps/Sec: 10.97, Grad Norm: 0.0072 +[2025-02-19 19:51:01] (step=0181700) Train Loss: 0.1499, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:51:09] (step=0181800) Train Loss: 0.1514, Train Steps/Sec: 11.66, Grad Norm: 0.0072 +[2025-02-19 19:51:18] (step=0181900) Train Loss: 0.1497, Train Steps/Sec: 11.66, Grad Norm: 0.0073 +[2025-02-19 19:51:27] (step=0182000) Train Loss: 0.1515, Train Steps/Sec: 11.66, Grad Norm: 0.0072 +[2025-02-19 19:51:35] (step=0182100) Train Loss: 0.1519, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:51:44] (step=0182200) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:51:52] (step=0182300) Train Loss: 0.1519, Train Steps/Sec: 11.63, Grad Norm: 0.0073 +[2025-02-19 19:52:01] (step=0182400) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0072 +[2025-02-19 19:52:10] (step=0182500) Train Loss: 0.1528, Train Steps/Sec: 11.63, Grad Norm: 0.0076 +[2025-02-19 19:52:18] (step=0182600) Train Loss: 0.1520, Train Steps/Sec: 11.64, Grad Norm: 0.0072 +[2025-02-19 19:52:28] (step=0182700) Train Loss: 0.1500, Train Steps/Sec: 9.96, Grad Norm: 0.0076 +[2025-02-19 19:52:37] (step=0182800) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0074 +[2025-02-19 19:52:45] (step=0182900) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0074 +[2025-02-19 19:52:54] (step=0183000) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0074 +[2025-02-19 19:53:03] (step=0183100) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0075 +[2025-02-19 19:53:11] (step=0183200) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0073 +[2025-02-19 19:53:20] (step=0183300) Train Loss: 0.1516, Train Steps/Sec: 11.60, Grad Norm: 0.0073 +[2025-02-19 19:53:28] (step=0183400) Train Loss: 0.1525, Train Steps/Sec: 11.60, Grad Norm: 0.0076 +[2025-02-19 19:53:37] (step=0183500) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0075 +[2025-02-19 19:53:47] (step=0183600) Train Loss: 0.1518, Train Steps/Sec: 9.99, Grad Norm: 0.0073 +[2025-02-19 19:53:56] (step=0183700) Train Loss: 0.1504, Train Steps/Sec: 10.90, Grad Norm: 0.0071 +[2025-02-19 19:54:05] (step=0183800) Train Loss: 0.1517, Train Steps/Sec: 11.26, Grad Norm: 0.0073 +[2025-02-19 19:54:15] (step=0183900) Train Loss: 0.1518, Train Steps/Sec: 10.07, Grad Norm: 0.0073 +[2025-02-19 19:54:24] (step=0184000) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0074 +[2025-02-19 19:54:32] (step=0184100) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0072 +[2025-02-19 19:54:41] (step=0184200) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0073 +[2025-02-19 19:54:50] (step=0184300) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0078 +[2025-02-19 19:54:58] (step=0184400) Train Loss: 0.1501, Train Steps/Sec: 11.67, Grad Norm: 0.0074 +[2025-02-19 19:55:07] (step=0184500) Train Loss: 0.1508, Train Steps/Sec: 11.67, Grad Norm: 0.0078 +[2025-02-19 19:55:15] (step=0184600) Train Loss: 0.1518, Train Steps/Sec: 11.66, Grad Norm: 0.0071 +[2025-02-19 19:55:24] (step=0184700) Train Loss: 0.1501, Train Steps/Sec: 11.67, Grad Norm: 0.0071 +[2025-02-19 19:55:32] (step=0184800) Train Loss: 0.1519, Train Steps/Sec: 11.68, Grad Norm: 0.0073 +[2025-02-19 19:55:41] (step=0184900) Train Loss: 0.1530, Train Steps/Sec: 11.67, Grad Norm: 0.0076 +[2025-02-19 19:55:50] (step=0185000) Train Loss: 0.1508, Train Steps/Sec: 11.67, Grad Norm: 0.0074 +[2025-02-19 19:55:58] (step=0185100) Train Loss: 0.1527, Train Steps/Sec: 11.67, Grad Norm: 0.0074 +[2025-02-19 19:56:08] (step=0185200) Train Loss: 0.1519, Train Steps/Sec: 10.09, Grad Norm: 0.0072 +[2025-02-19 19:56:17] (step=0185300) Train Loss: 0.1527, Train Steps/Sec: 11.64, Grad Norm: 0.0074 +[2025-02-19 19:56:25] (step=0185400) Train Loss: 0.1531, Train Steps/Sec: 11.64, Grad Norm: 0.0074 +[2025-02-19 19:56:34] (step=0185500) Train Loss: 0.1524, Train Steps/Sec: 11.65, Grad Norm: 0.0078 +[2025-02-19 19:56:42] (step=0185600) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 19:56:52] (step=0185700) Train Loss: 0.1529, Train Steps/Sec: 10.96, Grad Norm: 0.0074 +[2025-02-19 19:57:01] (step=0185800) Train Loss: 0.1514, Train Steps/Sec: 10.29, Grad Norm: 0.0077 +[2025-02-19 19:57:10] (step=0185900) Train Loss: 0.1511, Train Steps/Sec: 11.31, Grad Norm: 0.0072 +[2025-02-19 19:57:19] (step=0186000) Train Loss: 0.1509, Train Steps/Sec: 11.33, Grad Norm: 0.0075 +[2025-02-19 19:57:27] (step=0186100) Train Loss: 0.1515, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:57:36] (step=0186200) Train Loss: 0.1514, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:57:45] (step=0186300) Train Loss: 0.1515, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 19:57:55] (step=0186400) Train Loss: 0.1525, Train Steps/Sec: 10.00, Grad Norm: 0.0074 +[2025-02-19 19:58:03] (step=0186500) Train Loss: 0.1533, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 19:58:12] (step=0186600) Train Loss: 0.1524, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 19:58:20] (step=0186700) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0074 +[2025-02-19 19:58:29] (step=0186800) Train Loss: 0.1528, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 19:58:38] (step=0186900) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 19:58:46] (step=0187000) Train Loss: 0.1523, Train Steps/Sec: 11.65, Grad Norm: 0.0074 +[2025-02-19 19:58:55] (step=0187100) Train Loss: 0.1508, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 19:59:03] (step=0187200) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 19:59:12] (step=0187300) Train Loss: 0.1510, Train Steps/Sec: 11.66, Grad Norm: 0.0073 +[2025-02-19 19:59:20] (step=0187400) Train Loss: 0.1510, Train Steps/Sec: 11.66, Grad Norm: 0.0072 +[2025-02-19 19:59:29] (step=0187500) Train Loss: 0.1515, Train Steps/Sec: 11.66, Grad Norm: 0.0073 +[2025-02-19 19:59:38] (step=0187600) Train Loss: 0.1531, Train Steps/Sec: 11.66, Grad Norm: 0.0072 +[2025-02-19 19:59:48] (step=0187700) Train Loss: 0.1509, Train Steps/Sec: 10.03, Grad Norm: 0.0075 +[2025-02-19 19:59:56] (step=0187800) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0073 +[2025-02-19 20:00:06] (step=0187900) Train Loss: 0.1506, Train Steps/Sec: 10.31, Grad Norm: 0.0074 +[2025-02-19 20:00:15] (step=0188000) Train Loss: 0.1526, Train Steps/Sec: 10.89, Grad Norm: 0.0074 +[2025-02-19 20:00:24] (step=0188100) Train Loss: 0.1517, Train Steps/Sec: 11.30, Grad Norm: 0.0072 +[2025-02-19 20:00:33] (step=0188200) Train Loss: 0.1515, Train Steps/Sec: 11.31, Grad Norm: 0.0073 +[2025-02-19 20:00:41] (step=0188300) Train Loss: 0.1518, Train Steps/Sec: 11.66, Grad Norm: 0.0072 +[2025-02-19 20:00:50] (step=0188400) Train Loss: 0.1525, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 20:00:59] (step=0188500) Train Loss: 0.1525, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 20:01:07] (step=0188600) Train Loss: 0.1509, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 20:01:16] (step=0188700) Train Loss: 0.1509, Train Steps/Sec: 11.67, Grad Norm: 0.0077 +[2025-02-19 20:01:24] (step=0188800) Train Loss: 0.1510, Train Steps/Sec: 11.67, Grad Norm: 0.0069 +[2025-02-19 20:01:33] (step=0188900) Train Loss: 0.1514, Train Steps/Sec: 11.63, Grad Norm: 0.0072 +[2025-02-19 20:01:43] (step=0189000) Train Loss: 0.1529, Train Steps/Sec: 10.10, Grad Norm: 0.0074 +[2025-02-19 20:01:51] (step=0189100) Train Loss: 0.1517, Train Steps/Sec: 11.69, Grad Norm: 0.0077 +[2025-02-19 20:02:00] (step=0189200) Train Loss: 0.1508, Train Steps/Sec: 11.69, Grad Norm: 0.0075 +[2025-02-19 20:02:08] (step=0189300) Train Loss: 0.1520, Train Steps/Sec: 11.68, Grad Norm: 0.0075 +[2025-02-19 20:02:17] (step=0189400) Train Loss: 0.1513, Train Steps/Sec: 11.69, Grad Norm: 0.0073 +[2025-02-19 20:02:26] (step=0189500) Train Loss: 0.1515, Train Steps/Sec: 11.68, Grad Norm: 0.0076 +[2025-02-19 20:02:34] (step=0189600) Train Loss: 0.1527, Train Steps/Sec: 11.68, Grad Norm: 0.0071 +[2025-02-19 20:02:43] (step=0189700) Train Loss: 0.1529, Train Steps/Sec: 11.68, Grad Norm: 0.0075 +[2025-02-19 20:02:51] (step=0189800) Train Loss: 0.1505, Train Steps/Sec: 11.68, Grad Norm: 0.0072 +[2025-02-19 20:03:00] (step=0189900) Train Loss: 0.1506, Train Steps/Sec: 11.68, Grad Norm: 0.0073 +[2025-02-19 20:03:08] (step=0190000) Train Loss: 0.1513, Train Steps/Sec: 11.68, Grad Norm: 0.0074 +[2025-02-19 20:03:18] (step=0190100) Train Loss: 0.1519, Train Steps/Sec: 10.03, Grad Norm: 0.0072 +[2025-02-19 20:03:29] (step=0190200) Train Loss: 0.1513, Train Steps/Sec: 9.68, Grad Norm: 0.0071 +[2025-02-19 20:03:38] (step=0190300) Train Loss: 0.1510, Train Steps/Sec: 10.96, Grad Norm: 0.0075 +[2025-02-19 20:03:46] (step=0190400) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 20:03:55] (step=0190500) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0073 +[2025-02-19 20:04:04] (step=0190600) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0074 +[2025-02-19 20:04:12] (step=0190700) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 20:04:21] (step=0190800) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 20:04:29] (step=0190900) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 20:04:38] (step=0191000) Train Loss: 0.1506, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 20:04:47] (step=0191100) Train Loss: 0.1518, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 20:04:55] (step=0191200) Train Loss: 0.1510, Train Steps/Sec: 11.66, Grad Norm: 0.0071 +[2025-02-19 20:05:04] (step=0191300) Train Loss: 0.1492, Train Steps/Sec: 11.66, Grad Norm: 0.0072 +[2025-02-19 20:05:12] (step=0191400) Train Loss: 0.1528, Train Steps/Sec: 11.61, Grad Norm: 0.0074 +[2025-02-19 20:05:22] (step=0191500) Train Loss: 0.1534, Train Steps/Sec: 10.00, Grad Norm: 0.0075 +[2025-02-19 20:05:31] (step=0191600) Train Loss: 0.1501, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 20:05:39] (step=0191700) Train Loss: 0.1517, Train Steps/Sec: 11.66, Grad Norm: 0.0076 +[2025-02-19 20:05:48] (step=0191800) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0071 +[2025-02-19 20:05:57] (step=0191900) Train Loss: 0.1509, Train Steps/Sec: 11.66, Grad Norm: 0.0076 +[2025-02-19 20:06:05] (step=0192000) Train Loss: 0.1520, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 20:06:14] (step=0192100) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0074 +[2025-02-19 20:06:22] (step=0192200) Train Loss: 0.1521, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 20:06:32] (step=0192300) Train Loss: 0.1517, Train Steps/Sec: 10.02, Grad Norm: 0.0075 +[2025-02-19 20:06:41] (step=0192400) Train Loss: 0.1508, Train Steps/Sec: 10.93, Grad Norm: 0.0073 +[2025-02-19 20:06:50] (step=0192500) Train Loss: 0.1536, Train Steps/Sec: 11.31, Grad Norm: 0.0071 +[2025-02-19 20:06:59] (step=0192600) Train Loss: 0.1512, Train Steps/Sec: 11.66, Grad Norm: 0.0071 +[2025-02-19 20:07:09] (step=0192700) Train Loss: 0.1511, Train Steps/Sec: 10.12, Grad Norm: 0.0072 +[2025-02-19 20:07:17] (step=0192800) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0072 +[2025-02-19 20:07:26] (step=0192900) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 20:07:35] (step=0193000) Train Loss: 0.1519, Train Steps/Sec: 11.63, Grad Norm: 0.0074 +[2025-02-19 20:07:43] (step=0193100) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0074 +[2025-02-19 20:07:52] (step=0193200) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0073 +[2025-02-19 20:08:00] (step=0193300) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0075 +[2025-02-19 20:08:09] (step=0193400) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 20:08:18] (step=0193500) Train Loss: 0.1523, Train Steps/Sec: 11.65, Grad Norm: 0.0074 +[2025-02-19 20:08:26] (step=0193600) Train Loss: 0.1497, Train Steps/Sec: 11.66, Grad Norm: 0.0073 +[2025-02-19 20:08:35] (step=0193700) Train Loss: 0.1525, Train Steps/Sec: 11.65, Grad Norm: 0.0074 +[2025-02-19 20:08:43] (step=0193800) Train Loss: 0.1519, Train Steps/Sec: 11.63, Grad Norm: 0.0073 +[2025-02-19 20:08:52] (step=0193900) Train Loss: 0.1511, Train Steps/Sec: 11.59, Grad Norm: 0.0071 +[2025-02-19 20:09:02] (step=0194000) Train Loss: 0.1514, Train Steps/Sec: 10.10, Grad Norm: 0.0073 +[2025-02-19 20:09:10] (step=0194100) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 20:09:19] (step=0194200) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0072 +[2025-02-19 20:09:28] (step=0194300) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0075 +[2025-02-19 20:09:37] (step=0194400) Train Loss: 0.1516, Train Steps/Sec: 10.59, Grad Norm: 0.0076 +[2025-02-19 20:09:46] (step=0194500) Train Loss: 0.1520, Train Steps/Sec: 10.60, Grad Norm: 0.0071 +[2025-02-19 20:09:55] (step=0194600) Train Loss: 0.1523, Train Steps/Sec: 11.29, Grad Norm: 0.0074 +[2025-02-19 20:10:04] (step=0194700) Train Loss: 0.1515, Train Steps/Sec: 11.30, Grad Norm: 0.0075 +[2025-02-19 20:10:13] (step=0194800) Train Loss: 0.1525, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 20:10:21] (step=0194900) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 20:10:30] (step=0195000) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 20:10:39] (step=0195100) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 20:10:49] (step=0195200) Train Loss: 0.1491, Train Steps/Sec: 10.01, Grad Norm: 0.0073 +[2025-02-19 20:10:57] (step=0195300) Train Loss: 0.1514, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 20:11:06] (step=0195400) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 20:11:14] (step=0195500) Train Loss: 0.1519, Train Steps/Sec: 11.65, Grad Norm: 0.0071 +[2025-02-19 20:11:23] (step=0195600) Train Loss: 0.1522, Train Steps/Sec: 11.65, Grad Norm: 0.0074 +[2025-02-19 20:11:31] (step=0195700) Train Loss: 0.1510, Train Steps/Sec: 11.66, Grad Norm: 0.0073 +[2025-02-19 20:11:40] (step=0195800) Train Loss: 0.1515, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 20:11:49] (step=0195900) Train Loss: 0.1528, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 20:11:57] (step=0196000) Train Loss: 0.1518, Train Steps/Sec: 11.67, Grad Norm: 0.0073 +[2025-02-19 20:12:06] (step=0196100) Train Loss: 0.1511, Train Steps/Sec: 11.66, Grad Norm: 0.0072 +[2025-02-19 20:12:14] (step=0196200) Train Loss: 0.1519, Train Steps/Sec: 11.67, Grad Norm: 0.0070 +[2025-02-19 20:12:23] (step=0196300) Train Loss: 0.1513, Train Steps/Sec: 11.67, Grad Norm: 0.0072 +[2025-02-19 20:12:31] (step=0196400) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0074 +[2025-02-19 20:12:41] (step=0196500) Train Loss: 0.1511, Train Steps/Sec: 10.04, Grad Norm: 0.0074 +[2025-02-19 20:12:51] (step=0196600) Train Loss: 0.1512, Train Steps/Sec: 10.26, Grad Norm: 0.0079 +[2025-02-19 20:13:00] (step=0196700) Train Loss: 0.1508, Train Steps/Sec: 11.28, Grad Norm: 0.0073 +[2025-02-19 20:13:09] (step=0196800) Train Loss: 0.1505, Train Steps/Sec: 10.97, Grad Norm: 0.0070 +[2025-02-19 20:13:18] (step=0196900) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0077 +[2025-02-19 20:13:26] (step=0197000) Train Loss: 0.1503, Train Steps/Sec: 11.66, Grad Norm: 0.0072 +[2025-02-19 20:13:35] (step=0197100) Train Loss: 0.1508, Train Steps/Sec: 11.66, Grad Norm: 0.0072 +[2025-02-19 20:13:44] (step=0197200) Train Loss: 0.1521, Train Steps/Sec: 11.64, Grad Norm: 0.0072 +[2025-02-19 20:13:52] (step=0197300) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 20:14:01] (step=0197400) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 20:14:09] (step=0197500) Train Loss: 0.1521, Train Steps/Sec: 11.64, Grad Norm: 0.0073 +[2025-02-19 20:14:18] (step=0197600) Train Loss: 0.1509, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 20:14:28] (step=0197700) Train Loss: 0.1517, Train Steps/Sec: 10.08, Grad Norm: 0.0071 +[2025-02-19 20:14:36] (step=0197800) Train Loss: 0.1503, Train Steps/Sec: 11.67, Grad Norm: 0.0071 +[2025-02-19 20:14:45] (step=0197900) Train Loss: 0.1492, Train Steps/Sec: 11.67, Grad Norm: 0.0074 +[2025-02-19 20:14:54] (step=0198000) Train Loss: 0.1518, Train Steps/Sec: 11.67, Grad Norm: 0.0072 +[2025-02-19 20:15:02] (step=0198100) Train Loss: 0.1522, Train Steps/Sec: 11.67, Grad Norm: 0.0070 +[2025-02-19 20:15:11] (step=0198200) Train Loss: 0.1513, Train Steps/Sec: 11.67, Grad Norm: 0.0070 +[2025-02-19 20:15:19] (step=0198300) Train Loss: 0.1503, Train Steps/Sec: 11.66, Grad Norm: 0.0071 +[2025-02-19 20:15:28] (step=0198400) Train Loss: 0.1515, Train Steps/Sec: 11.66, Grad Norm: 0.0070 +[2025-02-19 20:15:36] (step=0198500) Train Loss: 0.1507, Train Steps/Sec: 11.66, Grad Norm: 0.0071 +[2025-02-19 20:15:45] (step=0198600) Train Loss: 0.1518, Train Steps/Sec: 11.66, Grad Norm: 0.0069 +[2025-02-19 20:15:54] (step=0198700) Train Loss: 0.1520, Train Steps/Sec: 11.67, Grad Norm: 0.0070 +[2025-02-19 20:16:03] (step=0198800) Train Loss: 0.1503, Train Steps/Sec: 10.28, Grad Norm: 0.0072 +[2025-02-19 20:16:12] (step=0198900) Train Loss: 0.1532, Train Steps/Sec: 11.23, Grad Norm: 0.0071 +[2025-02-19 20:16:23] (step=0199000) Train Loss: 0.1507, Train Steps/Sec: 9.58, Grad Norm: 0.0071 +[2025-02-19 20:16:31] (step=0199100) Train Loss: 0.1535, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 20:16:40] (step=0199200) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0071 +[2025-02-19 20:16:48] (step=0199300) Train Loss: 0.1515, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 20:16:57] (step=0199400) Train Loss: 0.1502, Train Steps/Sec: 11.66, Grad Norm: 0.0069 +[2025-02-19 20:17:06] (step=0199500) Train Loss: 0.1508, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 20:17:14] (step=0199600) Train Loss: 0.1517, Train Steps/Sec: 11.65, Grad Norm: 0.0071 +[2025-02-19 20:17:23] (step=0199700) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0071 +[2025-02-19 20:17:31] (step=0199800) Train Loss: 0.1500, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 20:17:40] (step=0199900) Train Loss: 0.1522, Train Steps/Sec: 11.66, Grad Norm: 0.0073 +[2025-02-19 20:17:48] (step=0200000) Train Loss: 0.1512, Train Steps/Sec: 11.66, Grad Norm: 0.0072 +[2025-02-19 20:17:49] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0200000.pt +[2025-02-19 20:17:58] (step=0200100) Train Loss: 0.1514, Train Steps/Sec: 10.47, Grad Norm: 0.0076 +[2025-02-19 20:18:08] (step=0200200) Train Loss: 0.1504, Train Steps/Sec: 10.03, Grad Norm: 0.0071 +[2025-02-19 20:18:17] (step=0200300) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0072 +[2025-02-19 20:18:25] (step=0200400) Train Loss: 0.1506, Train Steps/Sec: 11.66, Grad Norm: 0.0069 +[2025-02-19 20:18:34] (step=0200500) Train Loss: 0.1514, Train Steps/Sec: 11.67, Grad Norm: 0.0072 +[2025-02-19 20:18:42] (step=0200600) Train Loss: 0.1520, Train Steps/Sec: 11.67, Grad Norm: 0.0073 +[2025-02-19 20:18:51] (step=0200700) Train Loss: 0.1520, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 20:18:59] (step=0200800) Train Loss: 0.1515, Train Steps/Sec: 11.67, Grad Norm: 0.0069 +[2025-02-19 20:19:08] (step=0200900) Train Loss: 0.1507, Train Steps/Sec: 11.28, Grad Norm: 0.0072 +[2025-02-19 20:19:18] (step=0201000) Train Loss: 0.1529, Train Steps/Sec: 10.62, Grad Norm: 0.0072 +[2025-02-19 20:19:27] (step=0201100) Train Loss: 0.1511, Train Steps/Sec: 11.29, Grad Norm: 0.0073 +[2025-02-19 20:19:36] (step=0201200) Train Loss: 0.1514, Train Steps/Sec: 10.97, Grad Norm: 0.0072 +[2025-02-19 20:19:44] (step=0201300) Train Loss: 0.1524, Train Steps/Sec: 11.67, Grad Norm: 0.0073 +[2025-02-19 20:19:53] (step=0201400) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0071 +[2025-02-19 20:20:03] (step=0201500) Train Loss: 0.1512, Train Steps/Sec: 10.08, Grad Norm: 0.0074 +[2025-02-19 20:20:11] (step=0201600) Train Loss: 0.1525, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 20:20:20] (step=0201700) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0073 +[2025-02-19 20:20:29] (step=0201800) Train Loss: 0.1525, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 20:20:37] (step=0201900) Train Loss: 0.1506, Train Steps/Sec: 11.66, Grad Norm: 0.0073 +[2025-02-19 20:20:46] (step=0202000) Train Loss: 0.1500, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 20:20:54] (step=0202100) Train Loss: 0.1511, Train Steps/Sec: 11.66, Grad Norm: 0.0072 +[2025-02-19 20:21:03] (step=0202200) Train Loss: 0.1516, Train Steps/Sec: 11.66, Grad Norm: 0.0074 +[2025-02-19 20:21:11] (step=0202300) Train Loss: 0.1514, Train Steps/Sec: 11.66, Grad Norm: 0.0071 +[2025-02-19 20:21:20] (step=0202400) Train Loss: 0.1504, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 20:21:29] (step=0202500) Train Loss: 0.1518, Train Steps/Sec: 11.66, Grad Norm: 0.0070 +[2025-02-19 20:21:37] (step=0202600) Train Loss: 0.1517, Train Steps/Sec: 11.66, Grad Norm: 0.0072 +[2025-02-19 20:21:47] (step=0202700) Train Loss: 0.1500, Train Steps/Sec: 10.03, Grad Norm: 0.0072 +[2025-02-19 20:21:56] (step=0202800) Train Loss: 0.1520, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 20:22:04] (step=0202900) Train Loss: 0.1516, Train Steps/Sec: 11.66, Grad Norm: 0.0071 +[2025-02-19 20:22:13] (step=0203000) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 20:22:22] (step=0203100) Train Loss: 0.1513, Train Steps/Sec: 10.95, Grad Norm: 0.0071 +[2025-02-19 20:22:32] (step=0203200) Train Loss: 0.1514, Train Steps/Sec: 10.57, Grad Norm: 0.0072 +[2025-02-19 20:22:40] (step=0203300) Train Loss: 0.1521, Train Steps/Sec: 11.31, Grad Norm: 0.0070 +[2025-02-19 20:22:49] (step=0203400) Train Loss: 0.1514, Train Steps/Sec: 11.28, Grad Norm: 0.0071 +[2025-02-19 20:22:58] (step=0203500) Train Loss: 0.1518, Train Steps/Sec: 11.61, Grad Norm: 0.0072 +[2025-02-19 20:23:06] (step=0203600) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0070 +[2025-02-19 20:23:15] (step=0203700) Train Loss: 0.1509, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 20:23:24] (step=0203800) Train Loss: 0.1514, Train Steps/Sec: 11.66, Grad Norm: 0.0075 +[2025-02-19 20:23:32] (step=0203900) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0074 +[2025-02-19 20:23:42] (step=0204000) Train Loss: 0.1524, Train Steps/Sec: 10.04, Grad Norm: 0.0075 +[2025-02-19 20:23:51] (step=0204100) Train Loss: 0.1519, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 20:23:59] (step=0204200) Train Loss: 0.1517, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 20:24:08] (step=0204300) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0071 +[2025-02-19 20:24:17] (step=0204400) Train Loss: 0.1522, Train Steps/Sec: 11.65, Grad Norm: 0.0071 +[2025-02-19 20:24:25] (step=0204500) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 20:24:34] (step=0204600) Train Loss: 0.1521, Train Steps/Sec: 11.65, Grad Norm: 0.0075 +[2025-02-19 20:24:42] (step=0204700) Train Loss: 0.1522, Train Steps/Sec: 11.64, Grad Norm: 0.0073 +[2025-02-19 20:24:51] (step=0204800) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0072 +[2025-02-19 20:24:59] (step=0204900) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 20:25:08] (step=0205000) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 20:25:17] (step=0205100) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 20:25:27] (step=0205200) Train Loss: 0.1521, Train Steps/Sec: 10.02, Grad Norm: 0.0071 +[2025-02-19 20:25:36] (step=0205300) Train Loss: 0.1527, Train Steps/Sec: 10.25, Grad Norm: 0.0071 +[2025-02-19 20:25:45] (step=0205400) Train Loss: 0.1516, Train Steps/Sec: 11.25, Grad Norm: 0.0072 +[2025-02-19 20:25:54] (step=0205500) Train Loss: 0.1506, Train Steps/Sec: 11.28, Grad Norm: 0.0071 +[2025-02-19 20:26:03] (step=0205600) Train Loss: 0.1504, Train Steps/Sec: 11.28, Grad Norm: 0.0072 +[2025-02-19 20:26:12] (step=0205700) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0071 +[2025-02-19 20:26:20] (step=0205800) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0070 +[2025-02-19 20:26:29] (step=0205900) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0075 +[2025-02-19 20:26:37] (step=0206000) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 20:26:46] (step=0206100) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0070 +[2025-02-19 20:26:55] (step=0206200) Train Loss: 0.1520, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 20:27:03] (step=0206300) Train Loss: 0.1522, Train Steps/Sec: 11.65, Grad Norm: 0.0074 +[2025-02-19 20:27:12] (step=0206400) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0071 +[2025-02-19 20:27:22] (step=0206500) Train Loss: 0.1505, Train Steps/Sec: 10.04, Grad Norm: 0.0071 +[2025-02-19 20:27:30] (step=0206600) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0073 +[2025-02-19 20:27:39] (step=0206700) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0072 +[2025-02-19 20:27:48] (step=0206800) Train Loss: 0.1519, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 20:27:56] (step=0206900) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 20:28:05] (step=0207000) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0074 +[2025-02-19 20:28:13] (step=0207100) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 20:28:22] (step=0207200) Train Loss: 0.1522, Train Steps/Sec: 11.65, Grad Norm: 0.0074 +[2025-02-19 20:28:31] (step=0207300) Train Loss: 0.1507, Train Steps/Sec: 11.65, Grad Norm: 0.0071 +[2025-02-19 20:28:39] (step=0207400) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 20:28:49] (step=0207500) Train Loss: 0.1520, Train Steps/Sec: 10.27, Grad Norm: 0.0071 +[2025-02-19 20:28:58] (step=0207600) Train Loss: 0.1508, Train Steps/Sec: 11.27, Grad Norm: 0.0070 +[2025-02-19 20:29:08] (step=0207700) Train Loss: 0.1505, Train Steps/Sec: 9.56, Grad Norm: 0.0070 +[2025-02-19 20:29:17] (step=0207800) Train Loss: 0.1525, Train Steps/Sec: 11.67, Grad Norm: 0.0072 +[2025-02-19 20:29:25] (step=0207900) Train Loss: 0.1509, Train Steps/Sec: 11.67, Grad Norm: 0.0069 +[2025-02-19 20:29:34] (step=0208000) Train Loss: 0.1509, Train Steps/Sec: 11.67, Grad Norm: 0.0071 +[2025-02-19 20:29:43] (step=0208100) Train Loss: 0.1512, Train Steps/Sec: 11.66, Grad Norm: 0.0073 +[2025-02-19 20:29:51] (step=0208200) Train Loss: 0.1524, Train Steps/Sec: 11.67, Grad Norm: 0.0072 +[2025-02-19 20:30:00] (step=0208300) Train Loss: 0.1522, Train Steps/Sec: 11.67, Grad Norm: 0.0072 +[2025-02-19 20:30:08] (step=0208400) Train Loss: 0.1517, Train Steps/Sec: 11.67, Grad Norm: 0.0069 +[2025-02-19 20:30:17] (step=0208500) Train Loss: 0.1510, Train Steps/Sec: 11.68, Grad Norm: 0.0072 +[2025-02-19 20:30:25] (step=0208600) Train Loss: 0.1516, Train Steps/Sec: 11.67, Grad Norm: 0.0072 +[2025-02-19 20:30:34] (step=0208700) Train Loss: 0.1521, Train Steps/Sec: 11.67, Grad Norm: 0.0072 +[2025-02-19 20:30:43] (step=0208800) Train Loss: 0.1500, Train Steps/Sec: 11.67, Grad Norm: 0.0073 +[2025-02-19 20:30:51] (step=0208900) Train Loss: 0.1514, Train Steps/Sec: 11.72, Grad Norm: 0.0069 +[2025-02-19 20:31:01] (step=0209000) Train Loss: 0.1519, Train Steps/Sec: 10.21, Grad Norm: 0.0071 +[2025-02-19 20:31:09] (step=0209100) Train Loss: 0.1521, Train Steps/Sec: 11.77, Grad Norm: 0.0072 +[2025-02-19 20:31:18] (step=0209200) Train Loss: 0.1516, Train Steps/Sec: 11.72, Grad Norm: 0.0074 +[2025-02-19 20:31:26] (step=0209300) Train Loss: 0.1505, Train Steps/Sec: 11.70, Grad Norm: 0.0069 +[2025-02-19 20:31:35] (step=0209400) Train Loss: 0.1503, Train Steps/Sec: 11.71, Grad Norm: 0.0070 +[2025-02-19 20:31:44] (step=0209500) Train Loss: 0.1515, Train Steps/Sec: 11.71, Grad Norm: 0.0072 +[2025-02-19 20:31:52] (step=0209600) Train Loss: 0.1525, Train Steps/Sec: 11.32, Grad Norm: 0.0072 +[2025-02-19 20:32:01] (step=0209700) Train Loss: 0.1507, Train Steps/Sec: 10.97, Grad Norm: 0.0071 +[2025-02-19 20:32:10] (step=0209800) Train Loss: 0.1514, Train Steps/Sec: 11.31, Grad Norm: 0.0074 +[2025-02-19 20:32:19] (step=0209900) Train Loss: 0.1528, Train Steps/Sec: 11.00, Grad Norm: 0.0074 +[2025-02-19 20:32:28] (step=0210000) Train Loss: 0.1514, Train Steps/Sec: 11.69, Grad Norm: 0.0073 +[2025-02-19 20:32:37] (step=0210100) Train Loss: 0.1516, Train Steps/Sec: 11.68, Grad Norm: 0.0073 +[2025-02-19 20:32:46] (step=0210200) Train Loss: 0.1521, Train Steps/Sec: 10.09, Grad Norm: 0.0074 +[2025-02-19 20:32:55] (step=0210300) Train Loss: 0.1517, Train Steps/Sec: 11.67, Grad Norm: 0.0071 +[2025-02-19 20:33:04] (step=0210400) Train Loss: 0.1508, Train Steps/Sec: 11.67, Grad Norm: 0.0073 +[2025-02-19 20:33:12] (step=0210500) Train Loss: 0.1507, Train Steps/Sec: 11.68, Grad Norm: 0.0071 +[2025-02-19 20:33:21] (step=0210600) Train Loss: 0.1500, Train Steps/Sec: 11.67, Grad Norm: 0.0073 +[2025-02-19 20:33:29] (step=0210700) Train Loss: 0.1508, Train Steps/Sec: 11.67, Grad Norm: 0.0069 +[2025-02-19 20:33:38] (step=0210800) Train Loss: 0.1514, Train Steps/Sec: 11.68, Grad Norm: 0.0071 +[2025-02-19 20:33:46] (step=0210900) Train Loss: 0.1525, Train Steps/Sec: 11.68, Grad Norm: 0.0071 +[2025-02-19 20:33:55] (step=0211000) Train Loss: 0.1512, Train Steps/Sec: 11.68, Grad Norm: 0.0070 +[2025-02-19 20:34:04] (step=0211100) Train Loss: 0.1525, Train Steps/Sec: 11.67, Grad Norm: 0.0069 +[2025-02-19 20:34:12] (step=0211200) Train Loss: 0.1510, Train Steps/Sec: 11.66, Grad Norm: 0.0071 +[2025-02-19 20:34:21] (step=0211300) Train Loss: 0.1509, Train Steps/Sec: 11.67, Grad Norm: 0.0074 +[2025-02-19 20:34:29] (step=0211400) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0071 +[2025-02-19 20:34:39] (step=0211500) Train Loss: 0.1499, Train Steps/Sec: 10.19, Grad Norm: 0.0072 +[2025-02-19 20:34:48] (step=0211600) Train Loss: 0.1515, Train Steps/Sec: 11.77, Grad Norm: 0.0070 +[2025-02-19 20:34:56] (step=0211700) Train Loss: 0.1526, Train Steps/Sec: 11.75, Grad Norm: 0.0075 +[2025-02-19 20:35:05] (step=0211800) Train Loss: 0.1512, Train Steps/Sec: 10.68, Grad Norm: 0.0071 +[2025-02-19 20:35:14] (step=0211900) Train Loss: 0.1508, Train Steps/Sec: 11.37, Grad Norm: 0.0070 +[2025-02-19 20:35:23] (step=0212000) Train Loss: 0.1505, Train Steps/Sec: 11.76, Grad Norm: 0.0073 +[2025-02-19 20:35:32] (step=0212100) Train Loss: 0.1515, Train Steps/Sec: 11.08, Grad Norm: 0.0073 +[2025-02-19 20:35:40] (step=0212200) Train Loss: 0.1510, Train Steps/Sec: 11.77, Grad Norm: 0.0070 +[2025-02-19 20:35:49] (step=0212300) Train Loss: 0.1511, Train Steps/Sec: 11.76, Grad Norm: 0.0073 +[2025-02-19 20:35:57] (step=0212400) Train Loss: 0.1526, Train Steps/Sec: 11.76, Grad Norm: 0.0072 +[2025-02-19 20:36:06] (step=0212500) Train Loss: 0.1519, Train Steps/Sec: 11.77, Grad Norm: 0.0072 +[2025-02-19 20:36:14] (step=0212600) Train Loss: 0.1519, Train Steps/Sec: 11.77, Grad Norm: 0.0070 +[2025-02-19 20:36:24] (step=0212700) Train Loss: 0.1501, Train Steps/Sec: 10.13, Grad Norm: 0.0072 +[2025-02-19 20:36:33] (step=0212800) Train Loss: 0.1513, Train Steps/Sec: 11.76, Grad Norm: 0.0071 +[2025-02-19 20:36:41] (step=0212900) Train Loss: 0.1506, Train Steps/Sec: 11.76, Grad Norm: 0.0070 +[2025-02-19 20:36:50] (step=0213000) Train Loss: 0.1514, Train Steps/Sec: 11.73, Grad Norm: 0.0071 +[2025-02-19 20:36:58] (step=0213100) Train Loss: 0.1511, Train Steps/Sec: 11.68, Grad Norm: 0.0072 +[2025-02-19 20:37:07] (step=0213200) Train Loss: 0.1523, Train Steps/Sec: 11.68, Grad Norm: 0.0073 +[2025-02-19 20:37:15] (step=0213300) Train Loss: 0.1506, Train Steps/Sec: 11.68, Grad Norm: 0.0069 +[2025-02-19 20:37:24] (step=0213400) Train Loss: 0.1507, Train Steps/Sec: 11.67, Grad Norm: 0.0067 +[2025-02-19 20:37:33] (step=0213500) Train Loss: 0.1514, Train Steps/Sec: 11.67, Grad Norm: 0.0071 +[2025-02-19 20:37:41] (step=0213600) Train Loss: 0.1521, Train Steps/Sec: 11.67, Grad Norm: 0.0071 +[2025-02-19 20:37:50] (step=0213700) Train Loss: 0.1522, Train Steps/Sec: 11.67, Grad Norm: 0.0071 +[2025-02-19 20:37:58] (step=0213800) Train Loss: 0.1518, Train Steps/Sec: 11.67, Grad Norm: 0.0070 +[2025-02-19 20:38:07] (step=0213900) Train Loss: 0.1523, Train Steps/Sec: 11.62, Grad Norm: 0.0072 +[2025-02-19 20:38:18] (step=0214000) Train Loss: 0.1520, Train Steps/Sec: 9.11, Grad Norm: 0.0072 +[2025-02-19 20:38:27] (step=0214100) Train Loss: 0.1517, Train Steps/Sec: 11.26, Grad Norm: 0.0071 +[2025-02-19 20:38:36] (step=0214200) Train Loss: 0.1515, Train Steps/Sec: 11.31, Grad Norm: 0.0070 +[2025-02-19 20:38:44] (step=0214300) Train Loss: 0.1516, Train Steps/Sec: 11.33, Grad Norm: 0.0071 +[2025-02-19 20:38:53] (step=0214400) Train Loss: 0.1505, Train Steps/Sec: 11.67, Grad Norm: 0.0076 +[2025-02-19 20:39:02] (step=0214500) Train Loss: 0.1524, Train Steps/Sec: 11.66, Grad Norm: 0.0071 +[2025-02-19 20:39:10] (step=0214600) Train Loss: 0.1522, Train Steps/Sec: 11.65, Grad Norm: 0.0076 +[2025-02-19 20:39:19] (step=0214700) Train Loss: 0.1512, Train Steps/Sec: 11.67, Grad Norm: 0.0072 +[2025-02-19 20:39:27] (step=0214800) Train Loss: 0.1521, Train Steps/Sec: 11.65, Grad Norm: 0.0071 +[2025-02-19 20:39:36] (step=0214900) Train Loss: 0.1497, Train Steps/Sec: 11.66, Grad Norm: 0.0069 +[2025-02-19 20:39:44] (step=0215000) Train Loss: 0.1503, Train Steps/Sec: 11.66, Grad Norm: 0.0070 +[2025-02-19 20:39:53] (step=0215100) Train Loss: 0.1522, Train Steps/Sec: 11.66, Grad Norm: 0.0069 +[2025-02-19 20:40:03] (step=0215200) Train Loss: 0.1493, Train Steps/Sec: 9.91, Grad Norm: 0.0075 +[2025-02-19 20:40:12] (step=0215300) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0074 +[2025-02-19 20:40:20] (step=0215400) Train Loss: 0.1520, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 20:40:29] (step=0215500) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0072 +[2025-02-19 20:40:38] (step=0215600) Train Loss: 0.1517, Train Steps/Sec: 11.65, Grad Norm: 0.0071 +[2025-02-19 20:40:46] (step=0215700) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0071 +[2025-02-19 20:40:55] (step=0215800) Train Loss: 0.1507, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 20:41:03] (step=0215900) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0072 +[2025-02-19 20:41:12] (step=0216000) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 20:41:20] (step=0216100) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0072 +[2025-02-19 20:41:30] (step=0216200) Train Loss: 0.1511, Train Steps/Sec: 10.30, Grad Norm: 0.0071 +[2025-02-19 20:41:39] (step=0216300) Train Loss: 0.1508, Train Steps/Sec: 11.28, Grad Norm: 0.0071 +[2025-02-19 20:41:48] (step=0216400) Train Loss: 0.1522, Train Steps/Sec: 10.95, Grad Norm: 0.0070 +[2025-02-19 20:41:58] (step=0216500) Train Loss: 0.1516, Train Steps/Sec: 10.10, Grad Norm: 0.0071 +[2025-02-19 20:42:07] (step=0216600) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0069 +[2025-02-19 20:42:15] (step=0216700) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0071 +[2025-02-19 20:42:24] (step=0216800) Train Loss: 0.1526, Train Steps/Sec: 11.65, Grad Norm: 0.0074 +[2025-02-19 20:42:32] (step=0216900) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 20:42:41] (step=0217000) Train Loss: 0.1525, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 20:42:50] (step=0217100) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0073 +[2025-02-19 20:42:58] (step=0217200) Train Loss: 0.1520, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 20:43:07] (step=0217300) Train Loss: 0.1517, Train Steps/Sec: 11.64, Grad Norm: 0.0073 +[2025-02-19 20:43:15] (step=0217400) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 20:43:24] (step=0217500) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0070 +[2025-02-19 20:43:33] (step=0217600) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 20:43:43] (step=0217700) Train Loss: 0.1522, Train Steps/Sec: 10.01, Grad Norm: 0.0070 +[2025-02-19 20:43:51] (step=0217800) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0071 +[2025-02-19 20:44:00] (step=0217900) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0071 +[2025-02-19 20:44:08] (step=0218000) Train Loss: 0.1512, Train Steps/Sec: 11.59, Grad Norm: 0.0069 +[2025-02-19 20:44:17] (step=0218100) Train Loss: 0.1509, Train Steps/Sec: 11.58, Grad Norm: 0.0074 +[2025-02-19 20:44:26] (step=0218200) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 20:44:35] (step=0218300) Train Loss: 0.1519, Train Steps/Sec: 11.23, Grad Norm: 0.0071 +[2025-02-19 20:44:44] (step=0218400) Train Loss: 0.1517, Train Steps/Sec: 10.30, Grad Norm: 0.0070 +[2025-02-19 20:44:53] (step=0218500) Train Loss: 0.1519, Train Steps/Sec: 11.23, Grad Norm: 0.0070 +[2025-02-19 20:45:02] (step=0218600) Train Loss: 0.1511, Train Steps/Sec: 10.95, Grad Norm: 0.0071 +[2025-02-19 20:45:11] (step=0218700) Train Loss: 0.1518, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 20:45:20] (step=0218800) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 20:45:28] (step=0218900) Train Loss: 0.1514, Train Steps/Sec: 11.59, Grad Norm: 0.0070 +[2025-02-19 20:45:38] (step=0219000) Train Loss: 0.1524, Train Steps/Sec: 10.03, Grad Norm: 0.0072 +[2025-02-19 20:45:47] (step=0219100) Train Loss: 0.1512, Train Steps/Sec: 11.67, Grad Norm: 0.0070 +[2025-02-19 20:45:55] (step=0219200) Train Loss: 0.1527, Train Steps/Sec: 11.60, Grad Norm: 0.0071 +[2025-02-19 20:46:04] (step=0219300) Train Loss: 0.1514, Train Steps/Sec: 11.59, Grad Norm: 0.0069 +[2025-02-19 20:46:13] (step=0219400) Train Loss: 0.1489, Train Steps/Sec: 11.59, Grad Norm: 0.0070 +[2025-02-19 20:46:21] (step=0219500) Train Loss: 0.1507, Train Steps/Sec: 11.59, Grad Norm: 0.0075 +[2025-02-19 20:46:30] (step=0219600) Train Loss: 0.1517, Train Steps/Sec: 11.59, Grad Norm: 0.0071 +[2025-02-19 20:46:39] (step=0219700) Train Loss: 0.1511, Train Steps/Sec: 11.59, Grad Norm: 0.0072 +[2025-02-19 20:46:47] (step=0219800) Train Loss: 0.1518, Train Steps/Sec: 11.59, Grad Norm: 0.0072 +[2025-02-19 20:46:56] (step=0219900) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0070 +[2025-02-19 20:47:04] (step=0220000) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0074 +[2025-02-19 20:47:05] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0220000.pt +[2025-02-19 20:47:14] (step=0220100) Train Loss: 0.1511, Train Steps/Sec: 10.57, Grad Norm: 0.0069 +[2025-02-19 20:47:24] (step=0220200) Train Loss: 0.1494, Train Steps/Sec: 9.97, Grad Norm: 0.0071 +[2025-02-19 20:47:32] (step=0220300) Train Loss: 0.1527, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 20:47:41] (step=0220400) Train Loss: 0.1516, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 20:47:51] (step=0220500) Train Loss: 0.1522, Train Steps/Sec: 9.99, Grad Norm: 0.0071 +[2025-02-19 20:48:00] (step=0220600) Train Loss: 0.1517, Train Steps/Sec: 11.26, Grad Norm: 0.0070 +[2025-02-19 20:48:09] (step=0220700) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 20:48:18] (step=0220800) Train Loss: 0.1524, Train Steps/Sec: 10.97, Grad Norm: 0.0071 +[2025-02-19 20:48:26] (step=0220900) Train Loss: 0.1522, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 20:48:35] (step=0221000) Train Loss: 0.1523, Train Steps/Sec: 11.60, Grad Norm: 0.0069 +[2025-02-19 20:48:44] (step=0221100) Train Loss: 0.1512, Train Steps/Sec: 11.54, Grad Norm: 0.0072 +[2025-02-19 20:48:52] (step=0221200) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0070 +[2025-02-19 20:49:01] (step=0221300) Train Loss: 0.1520, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 20:49:09] (step=0221400) Train Loss: 0.1507, Train Steps/Sec: 11.57, Grad Norm: 0.0070 +[2025-02-19 20:49:19] (step=0221500) Train Loss: 0.1518, Train Steps/Sec: 10.04, Grad Norm: 0.0072 +[2025-02-19 20:49:28] (step=0221600) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 20:49:37] (step=0221700) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 20:49:45] (step=0221800) Train Loss: 0.1521, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 20:49:54] (step=0221900) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 20:50:02] (step=0222000) Train Loss: 0.1522, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 20:50:11] (step=0222100) Train Loss: 0.1516, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 20:50:20] (step=0222200) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0070 +[2025-02-19 20:50:28] (step=0222300) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0072 +[2025-02-19 20:50:37] (step=0222400) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 20:50:45] (step=0222500) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 20:50:54] (step=0222600) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 20:51:05] (step=0222700) Train Loss: 0.1511, Train Steps/Sec: 8.78, Grad Norm: 0.0070 +[2025-02-19 20:51:14] (step=0222800) Train Loss: 0.1522, Train Steps/Sec: 11.22, Grad Norm: 0.0071 +[2025-02-19 20:51:23] (step=0222900) Train Loss: 0.1508, Train Steps/Sec: 11.26, Grad Norm: 0.0072 +[2025-02-19 20:51:32] (step=0223000) Train Loss: 0.1507, Train Steps/Sec: 11.29, Grad Norm: 0.0068 +[2025-02-19 20:51:41] (step=0223100) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0070 +[2025-02-19 20:51:49] (step=0223200) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 20:51:58] (step=0223300) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 20:52:07] (step=0223400) Train Loss: 0.1521, Train Steps/Sec: 11.63, Grad Norm: 0.0072 +[2025-02-19 20:52:15] (step=0223500) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 20:52:24] (step=0223600) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 20:52:32] (step=0223700) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0072 +[2025-02-19 20:52:41] (step=0223800) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 20:52:50] (step=0223900) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0072 +[2025-02-19 20:53:00] (step=0224000) Train Loss: 0.1520, Train Steps/Sec: 9.98, Grad Norm: 0.0069 +[2025-02-19 20:53:08] (step=0224100) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 20:53:17] (step=0224200) Train Loss: 0.1513, Train Steps/Sec: 11.63, Grad Norm: 0.0073 +[2025-02-19 20:53:25] (step=0224300) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0074 +[2025-02-19 20:53:34] (step=0224400) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0072 +[2025-02-19 20:53:43] (step=0224500) Train Loss: 0.1525, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 20:53:51] (step=0224600) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 20:54:00] (step=0224700) Train Loss: 0.1516, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 20:54:08] (step=0224800) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 20:54:18] (step=0224900) Train Loss: 0.1519, Train Steps/Sec: 10.02, Grad Norm: 0.0068 +[2025-02-19 20:54:27] (step=0225000) Train Loss: 0.1500, Train Steps/Sec: 11.26, Grad Norm: 0.0070 +[2025-02-19 20:54:36] (step=0225100) Train Loss: 0.1508, Train Steps/Sec: 10.96, Grad Norm: 0.0071 +[2025-02-19 20:54:46] (step=0225200) Train Loss: 0.1503, Train Steps/Sec: 10.05, Grad Norm: 0.0071 +[2025-02-19 20:54:55] (step=0225300) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 20:55:03] (step=0225400) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 20:55:12] (step=0225500) Train Loss: 0.1524, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 20:55:21] (step=0225600) Train Loss: 0.1520, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 20:55:29] (step=0225700) Train Loss: 0.1514, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 20:55:38] (step=0225800) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 20:55:46] (step=0225900) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 20:55:55] (step=0226000) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 20:56:04] (step=0226100) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0073 +[2025-02-19 20:56:12] (step=0226200) Train Loss: 0.1519, Train Steps/Sec: 11.63, Grad Norm: 0.0074 +[2025-02-19 20:56:21] (step=0226300) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 20:56:30] (step=0226400) Train Loss: 0.1514, Train Steps/Sec: 11.58, Grad Norm: 0.0069 +[2025-02-19 20:56:39] (step=0226500) Train Loss: 0.1512, Train Steps/Sec: 10.08, Grad Norm: 0.0072 +[2025-02-19 20:56:48] (step=0226600) Train Loss: 0.1534, Train Steps/Sec: 11.64, Grad Norm: 0.0073 +[2025-02-19 20:56:57] (step=0226700) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 20:57:05] (step=0226800) Train Loss: 0.1524, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 20:57:14] (step=0226900) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0072 +[2025-02-19 20:57:23] (step=0227000) Train Loss: 0.1519, Train Steps/Sec: 10.92, Grad Norm: 0.0070 +[2025-02-19 20:57:33] (step=0227100) Train Loss: 0.1507, Train Steps/Sec: 10.28, Grad Norm: 0.0069 +[2025-02-19 20:57:41] (step=0227200) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 20:57:50] (step=0227300) Train Loss: 0.1512, Train Steps/Sec: 10.96, Grad Norm: 0.0071 +[2025-02-19 20:57:59] (step=0227400) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 20:58:08] (step=0227500) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 20:58:16] (step=0227600) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 20:58:26] (step=0227700) Train Loss: 0.1525, Train Steps/Sec: 10.00, Grad Norm: 0.0070 +[2025-02-19 20:58:35] (step=0227800) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 20:58:43] (step=0227900) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 20:58:52] (step=0228000) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 20:59:01] (step=0228100) Train Loss: 0.1520, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 20:59:09] (step=0228200) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 20:59:18] (step=0228300) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0073 +[2025-02-19 20:59:26] (step=0228400) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 20:59:35] (step=0228500) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 20:59:44] (step=0228600) Train Loss: 0.1526, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 20:59:52] (step=0228700) Train Loss: 0.1523, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 21:00:01] (step=0228800) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0072 +[2025-02-19 21:00:09] (step=0228900) Train Loss: 0.1514, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 21:00:20] (step=0229000) Train Loss: 0.1519, Train Steps/Sec: 9.90, Grad Norm: 0.0072 +[2025-02-19 21:00:28] (step=0229100) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 21:00:38] (step=0229200) Train Loss: 0.1519, Train Steps/Sec: 10.28, Grad Norm: 0.0071 +[2025-02-19 21:00:47] (step=0229300) Train Loss: 0.1503, Train Steps/Sec: 10.89, Grad Norm: 0.0070 +[2025-02-19 21:00:56] (step=0229400) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0073 +[2025-02-19 21:01:05] (step=0229500) Train Loss: 0.1519, Train Steps/Sec: 10.96, Grad Norm: 0.0068 +[2025-02-19 21:01:13] (step=0229600) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:01:22] (step=0229700) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 21:01:31] (step=0229800) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:01:39] (step=0229900) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:01:48] (step=0230000) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0073 +[2025-02-19 21:01:56] (step=0230100) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0071 +[2025-02-19 21:02:06] (step=0230200) Train Loss: 0.1516, Train Steps/Sec: 9.95, Grad Norm: 0.0070 +[2025-02-19 21:02:15] (step=0230300) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 21:02:24] (step=0230400) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 21:02:32] (step=0230500) Train Loss: 0.1506, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 21:02:41] (step=0230600) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:02:49] (step=0230700) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 21:02:58] (step=0230800) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:03:07] (step=0230900) Train Loss: 0.1521, Train Steps/Sec: 11.64, Grad Norm: 0.0073 +[2025-02-19 21:03:15] (step=0231000) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:03:24] (step=0231100) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 21:03:32] (step=0231200) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 21:03:41] (step=0231300) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 21:03:51] (step=0231400) Train Loss: 0.1523, Train Steps/Sec: 10.00, Grad Norm: 0.0068 +[2025-02-19 21:04:01] (step=0231500) Train Loss: 0.1517, Train Steps/Sec: 9.75, Grad Norm: 0.0071 +[2025-02-19 21:04:10] (step=0231600) Train Loss: 0.1522, Train Steps/Sec: 11.26, Grad Norm: 0.0072 +[2025-02-19 21:04:19] (step=0231700) Train Loss: 0.1529, Train Steps/Sec: 11.30, Grad Norm: 0.0071 +[2025-02-19 21:04:27] (step=0231800) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:04:36] (step=0231900) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 21:04:45] (step=0232000) Train Loss: 0.1528, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 21:04:53] (step=0232100) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 21:05:02] (step=0232200) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0071 +[2025-02-19 21:05:11] (step=0232300) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 21:05:19] (step=0232400) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 21:05:28] (step=0232500) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 21:05:36] (step=0232600) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 21:05:46] (step=0232700) Train Loss: 0.1506, Train Steps/Sec: 10.02, Grad Norm: 0.0071 +[2025-02-19 21:05:55] (step=0232800) Train Loss: 0.1518, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 21:06:04] (step=0232900) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 21:06:12] (step=0233000) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0071 +[2025-02-19 21:06:21] (step=0233100) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0071 +[2025-02-19 21:06:29] (step=0233200) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 21:06:38] (step=0233300) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0072 +[2025-02-19 21:06:47] (step=0233400) Train Loss: 0.1525, Train Steps/Sec: 11.61, Grad Norm: 0.0070 +[2025-02-19 21:06:55] (step=0233500) Train Loss: 0.1496, Train Steps/Sec: 11.27, Grad Norm: 0.0071 +[2025-02-19 21:07:05] (step=0233600) Train Loss: 0.1512, Train Steps/Sec: 10.28, Grad Norm: 0.0071 +[2025-02-19 21:07:14] (step=0233700) Train Loss: 0.1512, Train Steps/Sec: 11.23, Grad Norm: 0.0073 +[2025-02-19 21:07:23] (step=0233800) Train Loss: 0.1505, Train Steps/Sec: 10.95, Grad Norm: 0.0069 +[2025-02-19 21:07:32] (step=0233900) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 21:07:42] (step=0234000) Train Loss: 0.1518, Train Steps/Sec: 10.05, Grad Norm: 0.0069 +[2025-02-19 21:07:50] (step=0234100) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 21:07:59] (step=0234200) Train Loss: 0.1521, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 21:08:08] (step=0234300) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 21:08:16] (step=0234400) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:08:25] (step=0234500) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0072 +[2025-02-19 21:08:33] (step=0234600) Train Loss: 0.1513, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 21:08:42] (step=0234700) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 21:08:51] (step=0234800) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 21:08:59] (step=0234900) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 21:09:08] (step=0235000) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:09:16] (step=0235100) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0073 +[2025-02-19 21:09:26] (step=0235200) Train Loss: 0.1518, Train Steps/Sec: 10.05, Grad Norm: 0.0073 +[2025-02-19 21:09:35] (step=0235300) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 21:09:44] (step=0235400) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 21:09:52] (step=0235500) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 21:10:01] (step=0235600) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 21:10:11] (step=0235700) Train Loss: 0.1527, Train Steps/Sec: 10.23, Grad Norm: 0.0069 +[2025-02-19 21:10:20] (step=0235800) Train Loss: 0.1513, Train Steps/Sec: 10.88, Grad Norm: 0.0070 +[2025-02-19 21:10:28] (step=0235900) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 21:10:37] (step=0236000) Train Loss: 0.1504, Train Steps/Sec: 10.95, Grad Norm: 0.0068 +[2025-02-19 21:10:46] (step=0236100) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 21:10:55] (step=0236200) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0071 +[2025-02-19 21:11:03] (step=0236300) Train Loss: 0.1519, Train Steps/Sec: 11.59, Grad Norm: 0.0070 +[2025-02-19 21:11:12] (step=0236400) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0070 +[2025-02-19 21:11:22] (step=0236500) Train Loss: 0.1517, Train Steps/Sec: 10.00, Grad Norm: 0.0071 +[2025-02-19 21:11:31] (step=0236600) Train Loss: 0.1519, Train Steps/Sec: 11.61, Grad Norm: 0.0072 +[2025-02-19 21:11:39] (step=0236700) Train Loss: 0.1539, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 21:11:48] (step=0236800) Train Loss: 0.1527, Train Steps/Sec: 11.63, Grad Norm: 0.0072 +[2025-02-19 21:11:56] (step=0236900) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 21:12:05] (step=0237000) Train Loss: 0.1518, Train Steps/Sec: 11.60, Grad Norm: 0.0072 +[2025-02-19 21:12:14] (step=0237100) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0070 +[2025-02-19 21:12:22] (step=0237200) Train Loss: 0.1521, Train Steps/Sec: 11.59, Grad Norm: 0.0072 +[2025-02-19 21:12:31] (step=0237300) Train Loss: 0.1517, Train Steps/Sec: 11.61, Grad Norm: 0.0070 +[2025-02-19 21:12:39] (step=0237400) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0070 +[2025-02-19 21:12:48] (step=0237500) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 21:12:57] (step=0237600) Train Loss: 0.1530, Train Steps/Sec: 11.61, Grad Norm: 0.0072 +[2025-02-19 21:13:07] (step=0237700) Train Loss: 0.1503, Train Steps/Sec: 9.98, Grad Norm: 0.0071 +[2025-02-19 21:13:15] (step=0237800) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 21:13:25] (step=0237900) Train Loss: 0.1517, Train Steps/Sec: 9.99, Grad Norm: 0.0068 +[2025-02-19 21:13:34] (step=0238000) Train Loss: 0.1509, Train Steps/Sec: 11.25, Grad Norm: 0.0070 +[2025-02-19 21:13:43] (step=0238100) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 21:13:52] (step=0238200) Train Loss: 0.1511, Train Steps/Sec: 10.96, Grad Norm: 0.0072 +[2025-02-19 21:14:01] (step=0238300) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0070 +[2025-02-19 21:14:09] (step=0238400) Train Loss: 0.1519, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 21:14:18] (step=0238500) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 21:14:26] (step=0238600) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 21:14:35] (step=0238700) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 21:14:44] (step=0238800) Train Loss: 0.1518, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:14:52] (step=0238900) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:15:02] (step=0239000) Train Loss: 0.1511, Train Steps/Sec: 10.08, Grad Norm: 0.0071 +[2025-02-19 21:15:11] (step=0239100) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0071 +[2025-02-19 21:15:19] (step=0239200) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0070 +[2025-02-19 21:15:28] (step=0239300) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 21:15:37] (step=0239400) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 21:15:45] (step=0239500) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0069 +[2025-02-19 21:15:54] (step=0239600) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0070 +[2025-02-19 21:16:02] (step=0239700) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0069 +[2025-02-19 21:16:11] (step=0239800) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 21:16:20] (step=0239900) Train Loss: 0.1518, Train Steps/Sec: 11.61, Grad Norm: 0.0071 +[2025-02-19 21:16:28] (step=0240000) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0070 +[2025-02-19 21:16:29] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0240000.pt +[2025-02-19 21:16:39] (step=0240100) Train Loss: 0.1507, Train Steps/Sec: 9.57, Grad Norm: 0.0069 +[2025-02-19 21:16:49] (step=0240200) Train Loss: 0.1503, Train Steps/Sec: 9.72, Grad Norm: 0.0068 +[2025-02-19 21:16:58] (step=0240300) Train Loss: 0.1511, Train Steps/Sec: 11.25, Grad Norm: 0.0070 +[2025-02-19 21:17:07] (step=0240400) Train Loss: 0.1525, Train Steps/Sec: 11.26, Grad Norm: 0.0067 +[2025-02-19 21:17:15] (step=0240500) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 21:17:24] (step=0240600) Train Loss: 0.1533, Train Steps/Sec: 11.62, Grad Norm: 0.0072 +[2025-02-19 21:17:33] (step=0240700) Train Loss: 0.1522, Train Steps/Sec: 11.62, Grad Norm: 0.0071 +[2025-02-19 21:17:41] (step=0240800) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 21:17:50] (step=0240900) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 21:17:58] (step=0241000) Train Loss: 0.1513, Train Steps/Sec: 11.58, Grad Norm: 0.0068 +[2025-02-19 21:18:07] (step=0241100) Train Loss: 0.1509, Train Steps/Sec: 11.54, Grad Norm: 0.0072 +[2025-02-19 21:18:16] (step=0241200) Train Loss: 0.1500, Train Steps/Sec: 11.55, Grad Norm: 0.0067 +[2025-02-19 21:18:24] (step=0241300) Train Loss: 0.1526, Train Steps/Sec: 11.55, Grad Norm: 0.0068 +[2025-02-19 21:18:33] (step=0241400) Train Loss: 0.1493, Train Steps/Sec: 11.57, Grad Norm: 0.0071 +[2025-02-19 21:18:43] (step=0241500) Train Loss: 0.1514, Train Steps/Sec: 9.98, Grad Norm: 0.0069 +[2025-02-19 21:18:52] (step=0241600) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0072 +[2025-02-19 21:19:00] (step=0241700) Train Loss: 0.1524, Train Steps/Sec: 11.59, Grad Norm: 0.0070 +[2025-02-19 21:19:09] (step=0241800) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0070 +[2025-02-19 21:19:18] (step=0241900) Train Loss: 0.1519, Train Steps/Sec: 11.60, Grad Norm: 0.0069 +[2025-02-19 21:19:26] (step=0242000) Train Loss: 0.1519, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 21:19:35] (step=0242100) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0070 +[2025-02-19 21:19:44] (step=0242200) Train Loss: 0.1503, Train Steps/Sec: 10.59, Grad Norm: 0.0068 +[2025-02-19 21:19:53] (step=0242300) Train Loss: 0.1504, Train Steps/Sec: 10.88, Grad Norm: 0.0068 +[2025-02-19 21:20:02] (step=0242400) Train Loss: 0.1514, Train Steps/Sec: 11.22, Grad Norm: 0.0069 +[2025-02-19 21:20:12] (step=0242500) Train Loss: 0.1517, Train Steps/Sec: 10.86, Grad Norm: 0.0069 +[2025-02-19 21:20:20] (step=0242600) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0070 +[2025-02-19 21:20:30] (step=0242700) Train Loss: 0.1506, Train Steps/Sec: 9.93, Grad Norm: 0.0070 +[2025-02-19 21:20:39] (step=0242800) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 21:20:48] (step=0242900) Train Loss: 0.1516, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 21:20:56] (step=0243000) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 21:21:05] (step=0243100) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 21:21:13] (step=0243200) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 21:21:22] (step=0243300) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0070 +[2025-02-19 21:21:31] (step=0243400) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 21:21:39] (step=0243500) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 21:21:48] (step=0243600) Train Loss: 0.1519, Train Steps/Sec: 11.62, Grad Norm: 0.0074 +[2025-02-19 21:21:56] (step=0243700) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0070 +[2025-02-19 21:22:05] (step=0243800) Train Loss: 0.1516, Train Steps/Sec: 11.60, Grad Norm: 0.0070 +[2025-02-19 21:22:14] (step=0243900) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 21:22:24] (step=0244000) Train Loss: 0.1501, Train Steps/Sec: 9.81, Grad Norm: 0.0069 +[2025-02-19 21:22:32] (step=0244100) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-19 21:22:41] (step=0244200) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 21:22:50] (step=0244300) Train Loss: 0.1522, Train Steps/Sec: 11.60, Grad Norm: 0.0071 +[2025-02-19 21:22:59] (step=0244400) Train Loss: 0.1518, Train Steps/Sec: 10.27, Grad Norm: 0.0069 +[2025-02-19 21:23:09] (step=0244500) Train Loss: 0.1518, Train Steps/Sec: 10.86, Grad Norm: 0.0068 +[2025-02-19 21:23:17] (step=0244600) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-19 21:23:26] (step=0244700) Train Loss: 0.1504, Train Steps/Sec: 10.90, Grad Norm: 0.0068 +[2025-02-19 21:23:35] (step=0244800) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 21:23:44] (step=0244900) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-19 21:23:52] (step=0245000) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 21:24:01] (step=0245100) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0072 +[2025-02-19 21:24:11] (step=0245200) Train Loss: 0.1506, Train Steps/Sec: 9.88, Grad Norm: 0.0072 +[2025-02-19 21:24:20] (step=0245300) Train Loss: 0.1507, Train Steps/Sec: 11.67, Grad Norm: 0.0072 +[2025-02-19 21:24:28] (step=0245400) Train Loss: 0.1525, Train Steps/Sec: 11.68, Grad Norm: 0.0071 +[2025-02-19 21:24:37] (step=0245500) Train Loss: 0.1504, Train Steps/Sec: 11.68, Grad Norm: 0.0069 +[2025-02-19 21:24:45] (step=0245600) Train Loss: 0.1511, Train Steps/Sec: 11.69, Grad Norm: 0.0069 +[2025-02-19 21:24:54] (step=0245700) Train Loss: 0.1507, Train Steps/Sec: 11.69, Grad Norm: 0.0067 +[2025-02-19 21:25:02] (step=0245800) Train Loss: 0.1507, Train Steps/Sec: 11.68, Grad Norm: 0.0070 +[2025-02-19 21:25:11] (step=0245900) Train Loss: 0.1506, Train Steps/Sec: 11.68, Grad Norm: 0.0069 +[2025-02-19 21:25:20] (step=0246000) Train Loss: 0.1515, Train Steps/Sec: 11.69, Grad Norm: 0.0067 +[2025-02-19 21:25:28] (step=0246100) Train Loss: 0.1515, Train Steps/Sec: 11.69, Grad Norm: 0.0073 +[2025-02-19 21:25:37] (step=0246200) Train Loss: 0.1508, Train Steps/Sec: 11.69, Grad Norm: 0.0069 +[2025-02-19 21:25:45] (step=0246300) Train Loss: 0.1508, Train Steps/Sec: 11.68, Grad Norm: 0.0067 +[2025-02-19 21:25:54] (step=0246400) Train Loss: 0.1498, Train Steps/Sec: 11.69, Grad Norm: 0.0067 +[2025-02-19 21:26:04] (step=0246500) Train Loss: 0.1505, Train Steps/Sec: 10.06, Grad Norm: 0.0067 +[2025-02-19 21:26:14] (step=0246600) Train Loss: 0.1519, Train Steps/Sec: 9.99, Grad Norm: 0.0070 +[2025-02-19 21:26:23] (step=0246700) Train Loss: 0.1516, Train Steps/Sec: 11.29, Grad Norm: 0.0070 +[2025-02-19 21:26:31] (step=0246800) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 21:26:40] (step=0246900) Train Loss: 0.1522, Train Steps/Sec: 10.93, Grad Norm: 0.0068 +[2025-02-19 21:26:49] (step=0247000) Train Loss: 0.1516, Train Steps/Sec: 11.67, Grad Norm: 0.0068 +[2025-02-19 21:26:57] (step=0247100) Train Loss: 0.1496, Train Steps/Sec: 11.66, Grad Norm: 0.0067 +[2025-02-19 21:27:06] (step=0247200) Train Loss: 0.1523, Train Steps/Sec: 11.67, Grad Norm: 0.0069 +[2025-02-19 21:27:15] (step=0247300) Train Loss: 0.1512, Train Steps/Sec: 11.66, Grad Norm: 0.0069 +[2025-02-19 21:27:23] (step=0247400) Train Loss: 0.1513, Train Steps/Sec: 11.67, Grad Norm: 0.0069 +[2025-02-19 21:27:32] (step=0247500) Train Loss: 0.1513, Train Steps/Sec: 11.66, Grad Norm: 0.0068 +[2025-02-19 21:27:40] (step=0247600) Train Loss: 0.1517, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 21:27:50] (step=0247700) Train Loss: 0.1503, Train Steps/Sec: 10.01, Grad Norm: 0.0069 +[2025-02-19 21:27:59] (step=0247800) Train Loss: 0.1513, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 21:28:08] (step=0247900) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0069 +[2025-02-19 21:28:16] (step=0248000) Train Loss: 0.1517, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:28:25] (step=0248100) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0072 +[2025-02-19 21:28:33] (step=0248200) Train Loss: 0.1523, Train Steps/Sec: 11.65, Grad Norm: 0.0069 +[2025-02-19 21:28:42] (step=0248300) Train Loss: 0.1520, Train Steps/Sec: 11.66, Grad Norm: 0.0071 +[2025-02-19 21:28:50] (step=0248400) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0069 +[2025-02-19 21:28:59] (step=0248500) Train Loss: 0.1514, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 21:29:08] (step=0248600) Train Loss: 0.1521, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 21:29:16] (step=0248700) Train Loss: 0.1511, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 21:29:26] (step=0248800) Train Loss: 0.1528, Train Steps/Sec: 9.99, Grad Norm: 0.0067 +[2025-02-19 21:29:35] (step=0248900) Train Loss: 0.1511, Train Steps/Sec: 11.29, Grad Norm: 0.0070 +[2025-02-19 21:29:46] (step=0249000) Train Loss: 0.1518, Train Steps/Sec: 9.52, Grad Norm: 0.0069 +[2025-02-19 21:29:54] (step=0249100) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0070 +[2025-02-19 21:30:03] (step=0249200) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0070 +[2025-02-19 21:30:11] (step=0249300) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 21:30:20] (step=0249400) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 21:30:29] (step=0249500) Train Loss: 0.1487, Train Steps/Sec: 11.66, Grad Norm: 0.0070 +[2025-02-19 21:30:37] (step=0249600) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 21:30:46] (step=0249700) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 21:30:54] (step=0249800) Train Loss: 0.1518, Train Steps/Sec: 11.61, Grad Norm: 0.0070 +[2025-02-19 21:31:03] (step=0249900) Train Loss: 0.1520, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 21:31:12] (step=0250000) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 21:31:20] (step=0250100) Train Loss: 0.1525, Train Steps/Sec: 11.60, Grad Norm: 0.0069 +[2025-02-19 21:31:29] (step=0250200) Train Loss: 0.1524, Train Steps/Sec: 10.83, Grad Norm: 0.0068 +[2025-02-19 21:31:39] (step=0250300) Train Loss: 0.1514, Train Steps/Sec: 10.69, Grad Norm: 0.0067 +[2025-02-19 21:31:47] (step=0250400) Train Loss: 0.1525, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 21:31:56] (step=0250500) Train Loss: 0.1522, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 21:32:05] (step=0250600) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 21:32:13] (step=0250700) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:32:22] (step=0250800) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 21:32:31] (step=0250900) Train Loss: 0.1507, Train Steps/Sec: 10.63, Grad Norm: 0.0069 +[2025-02-19 21:32:40] (step=0251000) Train Loss: 0.1522, Train Steps/Sec: 10.91, Grad Norm: 0.0068 +[2025-02-19 21:32:49] (step=0251100) Train Loss: 0.1498, Train Steps/Sec: 11.28, Grad Norm: 0.0070 +[2025-02-19 21:32:58] (step=0251200) Train Loss: 0.1501, Train Steps/Sec: 10.93, Grad Norm: 0.0069 +[2025-02-19 21:33:07] (step=0251300) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0072 +[2025-02-19 21:33:16] (step=0251400) Train Loss: 0.1518, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 21:33:26] (step=0251500) Train Loss: 0.1507, Train Steps/Sec: 9.99, Grad Norm: 0.0067 +[2025-02-19 21:33:34] (step=0251600) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 21:33:43] (step=0251700) Train Loss: 0.1501, Train Steps/Sec: 11.66, Grad Norm: 0.0068 +[2025-02-19 21:33:51] (step=0251800) Train Loss: 0.1512, Train Steps/Sec: 11.66, Grad Norm: 0.0068 +[2025-02-19 21:34:00] (step=0251900) Train Loss: 0.1525, Train Steps/Sec: 11.65, Grad Norm: 0.0069 +[2025-02-19 21:34:09] (step=0252000) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 21:34:17] (step=0252100) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 21:34:26] (step=0252200) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 21:34:34] (step=0252300) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 21:34:43] (step=0252400) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 21:34:52] (step=0252500) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:35:00] (step=0252600) Train Loss: 0.1526, Train Steps/Sec: 11.66, Grad Norm: 0.0072 +[2025-02-19 21:35:09] (step=0252700) Train Loss: 0.1517, Train Steps/Sec: 11.60, Grad Norm: 0.0072 +[2025-02-19 21:35:19] (step=0252800) Train Loss: 0.1516, Train Steps/Sec: 10.04, Grad Norm: 0.0068 +[2025-02-19 21:35:27] (step=0252900) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0069 +[2025-02-19 21:35:36] (step=0253000) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0073 +[2025-02-19 21:35:46] (step=0253100) Train Loss: 0.1511, Train Steps/Sec: 10.01, Grad Norm: 0.0068 +[2025-02-19 21:35:55] (step=0253200) Train Loss: 0.1505, Train Steps/Sec: 11.30, Grad Norm: 0.0069 +[2025-02-19 21:36:03] (step=0253300) Train Loss: 0.1516, Train Steps/Sec: 11.66, Grad Norm: 0.0072 +[2025-02-19 21:36:12] (step=0253400) Train Loss: 0.1508, Train Steps/Sec: 10.96, Grad Norm: 0.0068 +[2025-02-19 21:36:21] (step=0253500) Train Loss: 0.1510, Train Steps/Sec: 11.66, Grad Norm: 0.0068 +[2025-02-19 21:36:30] (step=0253600) Train Loss: 0.1511, Train Steps/Sec: 11.66, Grad Norm: 0.0069 +[2025-02-19 21:36:38] (step=0253700) Train Loss: 0.1504, Train Steps/Sec: 11.66, Grad Norm: 0.0069 +[2025-02-19 21:36:47] (step=0253800) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 21:36:55] (step=0253900) Train Loss: 0.1509, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 21:37:05] (step=0254000) Train Loss: 0.1497, Train Steps/Sec: 10.04, Grad Norm: 0.0068 +[2025-02-19 21:37:14] (step=0254100) Train Loss: 0.1519, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 21:37:22] (step=0254200) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 21:37:31] (step=0254300) Train Loss: 0.1496, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 21:37:40] (step=0254400) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 21:37:48] (step=0254500) Train Loss: 0.1509, Train Steps/Sec: 11.65, Grad Norm: 0.0069 +[2025-02-19 21:37:57] (step=0254600) Train Loss: 0.1508, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 21:38:05] (step=0254700) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0072 +[2025-02-19 21:38:14] (step=0254800) Train Loss: 0.1506, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 21:38:23] (step=0254900) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 21:38:31] (step=0255000) Train Loss: 0.1523, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 21:38:40] (step=0255100) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:38:48] (step=0255200) Train Loss: 0.1522, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 21:39:00] (step=0255300) Train Loss: 0.1509, Train Steps/Sec: 8.79, Grad Norm: 0.0066 +[2025-02-19 21:39:09] (step=0255400) Train Loss: 0.1513, Train Steps/Sec: 11.27, Grad Norm: 0.0067 +[2025-02-19 21:39:17] (step=0255500) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0070 +[2025-02-19 21:39:26] (step=0255600) Train Loss: 0.1523, Train Steps/Sec: 10.92, Grad Norm: 0.0069 +[2025-02-19 21:39:35] (step=0255700) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 21:39:44] (step=0255800) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 21:39:52] (step=0255900) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 21:40:01] (step=0256000) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 21:40:09] (step=0256100) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 21:40:18] (step=0256200) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:40:27] (step=0256300) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 21:40:35] (step=0256400) Train Loss: 0.1517, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:40:45] (step=0256500) Train Loss: 0.1505, Train Steps/Sec: 9.94, Grad Norm: 0.0067 +[2025-02-19 21:40:54] (step=0256600) Train Loss: 0.1517, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:41:02] (step=0256700) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 21:41:11] (step=0256800) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 21:41:20] (step=0256900) Train Loss: 0.1525, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 21:41:28] (step=0257000) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0069 +[2025-02-19 21:41:37] (step=0257100) Train Loss: 0.1517, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 21:41:45] (step=0257200) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 21:41:54] (step=0257300) Train Loss: 0.1513, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 21:42:03] (step=0257400) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 21:42:13] (step=0257500) Train Loss: 0.1514, Train Steps/Sec: 9.99, Grad Norm: 0.0070 +[2025-02-19 21:42:21] (step=0257600) Train Loss: 0.1504, Train Steps/Sec: 11.29, Grad Norm: 0.0067 +[2025-02-19 21:42:30] (step=0257700) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0069 +[2025-02-19 21:42:41] (step=0257800) Train Loss: 0.1511, Train Steps/Sec: 9.54, Grad Norm: 0.0068 +[2025-02-19 21:42:49] (step=0257900) Train Loss: 0.1517, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 21:42:58] (step=0258000) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0070 +[2025-02-19 21:43:06] (step=0258100) Train Loss: 0.1525, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:43:15] (step=0258200) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-19 21:43:24] (step=0258300) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 21:43:32] (step=0258400) Train Loss: 0.1514, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 21:43:41] (step=0258500) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 21:43:49] (step=0258600) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0066 +[2025-02-19 21:43:58] (step=0258700) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0069 +[2025-02-19 21:44:06] (step=0258800) Train Loss: 0.1513, Train Steps/Sec: 11.66, Grad Norm: 0.0067 +[2025-02-19 21:44:15] (step=0258900) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 21:44:25] (step=0259000) Train Loss: 0.1500, Train Steps/Sec: 9.93, Grad Norm: 0.0071 +[2025-02-19 21:44:34] (step=0259100) Train Loss: 0.1516, Train Steps/Sec: 11.66, Grad Norm: 0.0070 +[2025-02-19 21:44:42] (step=0259200) Train Loss: 0.1515, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 21:44:51] (step=0259300) Train Loss: 0.1514, Train Steps/Sec: 11.65, Grad Norm: 0.0069 +[2025-02-19 21:44:59] (step=0259400) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-19 21:45:08] (step=0259500) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 21:45:17] (step=0259600) Train Loss: 0.1525, Train Steps/Sec: 10.63, Grad Norm: 0.0068 +[2025-02-19 21:45:27] (step=0259700) Train Loss: 0.1501, Train Steps/Sec: 10.91, Grad Norm: 0.0066 +[2025-02-19 21:45:35] (step=0259800) Train Loss: 0.1504, Train Steps/Sec: 11.29, Grad Norm: 0.0067 +[2025-02-19 21:45:45] (step=0259900) Train Loss: 0.1524, Train Steps/Sec: 10.89, Grad Norm: 0.0067 +[2025-02-19 21:45:53] (step=0260000) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0069 +[2025-02-19 21:45:54] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0260000.pt +[2025-02-19 21:46:03] (step=0260100) Train Loss: 0.1509, Train Steps/Sec: 10.56, Grad Norm: 0.0069 +[2025-02-19 21:46:11] (step=0260200) Train Loss: 0.1507, Train Steps/Sec: 11.58, Grad Norm: 0.0072 +[2025-02-19 21:46:21] (step=0260300) Train Loss: 0.1509, Train Steps/Sec: 10.05, Grad Norm: 0.0069 +[2025-02-19 21:46:30] (step=0260400) Train Loss: 0.1521, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:46:39] (step=0260500) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 21:46:47] (step=0260600) Train Loss: 0.1513, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 21:46:56] (step=0260700) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 21:47:04] (step=0260800) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 21:47:13] (step=0260900) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 21:47:22] (step=0261000) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 21:47:30] (step=0261100) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 21:47:39] (step=0261200) Train Loss: 0.1524, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 21:47:47] (step=0261300) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 21:47:56] (step=0261400) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 21:48:06] (step=0261500) Train Loss: 0.1515, Train Steps/Sec: 10.04, Grad Norm: 0.0072 +[2025-02-19 21:48:15] (step=0261600) Train Loss: 0.1520, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 21:48:23] (step=0261700) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 21:48:33] (step=0261800) Train Loss: 0.1514, Train Steps/Sec: 9.99, Grad Norm: 0.0068 +[2025-02-19 21:48:42] (step=0261900) Train Loss: 0.1518, Train Steps/Sec: 11.26, Grad Norm: 0.0067 +[2025-02-19 21:48:51] (step=0262000) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 21:49:00] (step=0262100) Train Loss: 0.1506, Train Steps/Sec: 10.92, Grad Norm: 0.0072 +[2025-02-19 21:49:08] (step=0262200) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 21:49:17] (step=0262300) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:49:26] (step=0262400) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:49:34] (step=0262500) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-19 21:49:43] (step=0262600) Train Loss: 0.1521, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 21:49:51] (step=0262700) Train Loss: 0.1515, Train Steps/Sec: 11.59, Grad Norm: 0.0068 +[2025-02-19 21:50:01] (step=0262800) Train Loss: 0.1508, Train Steps/Sec: 10.03, Grad Norm: 0.0068 +[2025-02-19 21:50:10] (step=0262900) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 21:50:19] (step=0263000) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 21:50:27] (step=0263100) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 21:50:36] (step=0263200) Train Loss: 0.1518, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:50:44] (step=0263300) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0071 +[2025-02-19 21:50:53] (step=0263400) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 21:51:02] (step=0263500) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:51:10] (step=0263600) Train Loss: 0.1508, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 21:51:19] (step=0263700) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 21:51:27] (step=0263800) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 21:51:36] (step=0263900) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 21:51:47] (step=0264000) Train Loss: 0.1509, Train Steps/Sec: 8.71, Grad Norm: 0.0067 +[2025-02-19 21:51:56] (step=0264100) Train Loss: 0.1507, Train Steps/Sec: 11.23, Grad Norm: 0.0067 +[2025-02-19 21:52:05] (step=0264200) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 21:52:14] (step=0264300) Train Loss: 0.1503, Train Steps/Sec: 10.93, Grad Norm: 0.0066 +[2025-02-19 21:52:23] (step=0264400) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0070 +[2025-02-19 21:52:31] (step=0264500) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:52:40] (step=0264600) Train Loss: 0.1518, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:52:48] (step=0264700) Train Loss: 0.1519, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 21:52:57] (step=0264800) Train Loss: 0.1497, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 21:53:06] (step=0264900) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 21:53:14] (step=0265000) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0066 +[2025-02-19 21:53:23] (step=0265100) Train Loss: 0.1517, Train Steps/Sec: 11.67, Grad Norm: 0.0069 +[2025-02-19 21:53:31] (step=0265200) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-19 21:53:42] (step=0265300) Train Loss: 0.1515, Train Steps/Sec: 9.90, Grad Norm: 0.0067 +[2025-02-19 21:53:50] (step=0265400) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 21:53:59] (step=0265500) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:54:07] (step=0265600) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:54:16] (step=0265700) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 21:54:24] (step=0265800) Train Loss: 0.1516, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 21:54:33] (step=0265900) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:54:42] (step=0266000) Train Loss: 0.1518, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:54:51] (step=0266100) Train Loss: 0.1514, Train Steps/Sec: 11.29, Grad Norm: 0.0067 +[2025-02-19 21:55:00] (step=0266200) Train Loss: 0.1522, Train Steps/Sec: 10.26, Grad Norm: 0.0069 +[2025-02-19 21:55:09] (step=0266300) Train Loss: 0.1494, Train Steps/Sec: 11.26, Grad Norm: 0.0066 +[2025-02-19 21:55:18] (step=0266400) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 21:55:28] (step=0266500) Train Loss: 0.1512, Train Steps/Sec: 9.44, Grad Norm: 0.0067 +[2025-02-19 21:55:37] (step=0266600) Train Loss: 0.1500, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 21:55:46] (step=0266700) Train Loss: 0.1509, Train Steps/Sec: 11.66, Grad Norm: 0.0068 +[2025-02-19 21:55:54] (step=0266800) Train Loss: 0.1511, Train Steps/Sec: 11.66, Grad Norm: 0.0067 +[2025-02-19 21:56:03] (step=0266900) Train Loss: 0.1506, Train Steps/Sec: 11.65, Grad Norm: 0.0069 +[2025-02-19 21:56:11] (step=0267000) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 21:56:20] (step=0267100) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 21:56:28] (step=0267200) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 21:56:37] (step=0267300) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 21:56:46] (step=0267400) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:56:54] (step=0267500) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 21:57:03] (step=0267600) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 21:57:11] (step=0267700) Train Loss: 0.1503, Train Steps/Sec: 11.58, Grad Norm: 0.0068 +[2025-02-19 21:57:21] (step=0267800) Train Loss: 0.1513, Train Steps/Sec: 9.99, Grad Norm: 0.0066 +[2025-02-19 21:57:30] (step=0267900) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0069 +[2025-02-19 21:57:39] (step=0268000) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 21:57:47] (step=0268100) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 21:57:56] (step=0268200) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0070 +[2025-02-19 21:58:05] (step=0268300) Train Loss: 0.1506, Train Steps/Sec: 10.62, Grad Norm: 0.0070 +[2025-02-19 21:58:15] (step=0268400) Train Loss: 0.1502, Train Steps/Sec: 10.59, Grad Norm: 0.0068 +[2025-02-19 21:58:23] (step=0268500) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 21:58:33] (step=0268600) Train Loss: 0.1509, Train Steps/Sec: 10.93, Grad Norm: 0.0070 +[2025-02-19 21:58:41] (step=0268700) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 21:58:50] (step=0268800) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 21:58:58] (step=0268900) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 21:59:08] (step=0269000) Train Loss: 0.1509, Train Steps/Sec: 10.02, Grad Norm: 0.0068 +[2025-02-19 21:59:17] (step=0269100) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 21:59:25] (step=0269200) Train Loss: 0.1530, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 21:59:34] (step=0269300) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 21:59:43] (step=0269400) Train Loss: 0.1514, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 21:59:51] (step=0269500) Train Loss: 0.1499, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 22:00:00] (step=0269600) Train Loss: 0.1522, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:00:08] (step=0269700) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 22:00:17] (step=0269800) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:00:26] (step=0269900) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-19 22:00:34] (step=0270000) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 22:00:43] (step=0270100) Train Loss: 0.1516, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 22:00:51] (step=0270200) Train Loss: 0.1518, Train Steps/Sec: 11.59, Grad Norm: 0.0068 +[2025-02-19 22:01:01] (step=0270300) Train Loss: 0.1497, Train Steps/Sec: 10.03, Grad Norm: 0.0065 +[2025-02-19 22:01:10] (step=0270400) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 22:01:20] (step=0270500) Train Loss: 0.1500, Train Steps/Sec: 10.28, Grad Norm: 0.0067 +[2025-02-19 22:01:29] (step=0270600) Train Loss: 0.1507, Train Steps/Sec: 10.92, Grad Norm: 0.0066 +[2025-02-19 22:01:37] (step=0270700) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:01:47] (step=0270800) Train Loss: 0.1520, Train Steps/Sec: 10.94, Grad Norm: 0.0067 +[2025-02-19 22:01:55] (step=0270900) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 22:02:04] (step=0271000) Train Loss: 0.1487, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 22:02:12] (step=0271100) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:02:21] (step=0271200) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 22:02:30] (step=0271300) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:02:38] (step=0271400) Train Loss: 0.1527, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 22:02:48] (step=0271500) Train Loss: 0.1517, Train Steps/Sec: 9.99, Grad Norm: 0.0069 +[2025-02-19 22:02:57] (step=0271600) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 22:03:05] (step=0271700) Train Loss: 0.1517, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 22:03:14] (step=0271800) Train Loss: 0.1508, Train Steps/Sec: 11.65, Grad Norm: 0.0069 +[2025-02-19 22:03:23] (step=0271900) Train Loss: 0.1509, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 22:03:31] (step=0272000) Train Loss: 0.1515, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 22:03:40] (step=0272100) Train Loss: 0.1506, Train Steps/Sec: 11.65, Grad Norm: 0.0066 +[2025-02-19 22:03:48] (step=0272200) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 22:03:57] (step=0272300) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 22:04:05] (step=0272400) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 22:04:14] (step=0272500) Train Loss: 0.1520, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 22:04:23] (step=0272600) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-19 22:04:33] (step=0272700) Train Loss: 0.1518, Train Steps/Sec: 9.96, Grad Norm: 0.0068 +[2025-02-19 22:04:43] (step=0272800) Train Loss: 0.1518, Train Steps/Sec: 9.79, Grad Norm: 0.0069 +[2025-02-19 22:04:52] (step=0272900) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 22:05:01] (step=0273000) Train Loss: 0.1508, Train Steps/Sec: 10.89, Grad Norm: 0.0068 +[2025-02-19 22:05:09] (step=0273100) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 22:05:18] (step=0273200) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 22:05:26] (step=0273300) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:05:35] (step=0273400) Train Loss: 0.1524, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 22:05:44] (step=0273500) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:05:52] (step=0273600) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 22:06:01] (step=0273700) Train Loss: 0.1519, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 22:06:09] (step=0273800) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0069 +[2025-02-19 22:06:18] (step=0273900) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 22:06:28] (step=0274000) Train Loss: 0.1509, Train Steps/Sec: 9.98, Grad Norm: 0.0072 +[2025-02-19 22:06:37] (step=0274100) Train Loss: 0.1524, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 22:06:45] (step=0274200) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 22:06:54] (step=0274300) Train Loss: 0.1514, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 22:07:02] (step=0274400) Train Loss: 0.1514, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 22:07:11] (step=0274500) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 22:07:20] (step=0274600) Train Loss: 0.1523, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 22:07:28] (step=0274700) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 22:07:37] (step=0274800) Train Loss: 0.1501, Train Steps/Sec: 10.95, Grad Norm: 0.0069 +[2025-02-19 22:07:46] (step=0274900) Train Loss: 0.1508, Train Steps/Sec: 10.91, Grad Norm: 0.0068 +[2025-02-19 22:07:55] (step=0275000) Train Loss: 0.1500, Train Steps/Sec: 11.29, Grad Norm: 0.0067 +[2025-02-19 22:08:04] (step=0275100) Train Loss: 0.1511, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 22:08:13] (step=0275200) Train Loss: 0.1517, Train Steps/Sec: 10.89, Grad Norm: 0.0068 +[2025-02-19 22:08:23] (step=0275300) Train Loss: 0.1514, Train Steps/Sec: 10.07, Grad Norm: 0.0066 +[2025-02-19 22:08:32] (step=0275400) Train Loss: 0.1516, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:08:40] (step=0275500) Train Loss: 0.1529, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 22:08:49] (step=0275600) Train Loss: 0.1525, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:08:57] (step=0275700) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:09:06] (step=0275800) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:09:15] (step=0275900) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 22:09:23] (step=0276000) Train Loss: 0.1488, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 22:09:32] (step=0276100) Train Loss: 0.1517, Train Steps/Sec: 11.62, Grad Norm: 0.0071 +[2025-02-19 22:09:40] (step=0276200) Train Loss: 0.1516, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 22:09:49] (step=0276300) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:09:58] (step=0276400) Train Loss: 0.1491, Train Steps/Sec: 11.59, Grad Norm: 0.0067 +[2025-02-19 22:10:08] (step=0276500) Train Loss: 0.1511, Train Steps/Sec: 10.00, Grad Norm: 0.0071 +[2025-02-19 22:10:16] (step=0276600) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:10:25] (step=0276700) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 22:10:33] (step=0276800) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 22:10:42] (step=0276900) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 22:10:51] (step=0277000) Train Loss: 0.1512, Train Steps/Sec: 10.64, Grad Norm: 0.0069 +[2025-02-19 22:11:01] (step=0277100) Train Loss: 0.1524, Train Steps/Sec: 10.43, Grad Norm: 0.0069 +[2025-02-19 22:11:10] (step=0277200) Train Loss: 0.1498, Train Steps/Sec: 11.43, Grad Norm: 0.0068 +[2025-02-19 22:11:19] (step=0277300) Train Loss: 0.1516, Train Steps/Sec: 10.66, Grad Norm: 0.0068 +[2025-02-19 22:11:28] (step=0277400) Train Loss: 0.1513, Train Steps/Sec: 11.43, Grad Norm: 0.0066 +[2025-02-19 22:11:37] (step=0277500) Train Loss: 0.1504, Train Steps/Sec: 11.44, Grad Norm: 0.0067 +[2025-02-19 22:11:45] (step=0277600) Train Loss: 0.1522, Train Steps/Sec: 11.43, Grad Norm: 0.0069 +[2025-02-19 22:11:54] (step=0277700) Train Loss: 0.1512, Train Steps/Sec: 11.38, Grad Norm: 0.0068 +[2025-02-19 22:12:04] (step=0277800) Train Loss: 0.1518, Train Steps/Sec: 9.94, Grad Norm: 0.0067 +[2025-02-19 22:12:13] (step=0277900) Train Loss: 0.1513, Train Steps/Sec: 11.59, Grad Norm: 0.0066 +[2025-02-19 22:12:21] (step=0278000) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 22:12:30] (step=0278100) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 22:12:39] (step=0278200) Train Loss: 0.1512, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 22:12:47] (step=0278300) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 22:12:56] (step=0278400) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 22:13:05] (step=0278500) Train Loss: 0.1490, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 22:13:13] (step=0278600) Train Loss: 0.1517, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-19 22:13:22] (step=0278700) Train Loss: 0.1522, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 22:13:30] (step=0278800) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 22:13:39] (step=0278900) Train Loss: 0.1515, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 22:13:49] (step=0279000) Train Loss: 0.1503, Train Steps/Sec: 9.95, Grad Norm: 0.0067 +[2025-02-19 22:13:58] (step=0279100) Train Loss: 0.1527, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 22:14:08] (step=0279200) Train Loss: 0.1506, Train Steps/Sec: 9.99, Grad Norm: 0.0067 +[2025-02-19 22:14:17] (step=0279300) Train Loss: 0.1514, Train Steps/Sec: 11.24, Grad Norm: 0.0069 +[2025-02-19 22:14:25] (step=0279400) Train Loss: 0.1519, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 22:14:34] (step=0279500) Train Loss: 0.1511, Train Steps/Sec: 10.89, Grad Norm: 0.0067 +[2025-02-19 22:14:43] (step=0279600) Train Loss: 0.1519, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:14:52] (step=0279700) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 22:15:00] (step=0279800) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 22:15:09] (step=0279900) Train Loss: 0.1517, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 22:15:17] (step=0280000) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 22:15:18] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0280000.pt +[2025-02-19 22:15:27] (step=0280100) Train Loss: 0.1506, Train Steps/Sec: 10.83, Grad Norm: 0.0067 +[2025-02-19 22:15:35] (step=0280200) Train Loss: 0.1510, Train Steps/Sec: 11.58, Grad Norm: 0.0067 +[2025-02-19 22:15:45] (step=0280300) Train Loss: 0.1517, Train Steps/Sec: 10.03, Grad Norm: 0.0068 +[2025-02-19 22:15:54] (step=0280400) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:16:02] (step=0280500) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:16:11] (step=0280600) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0070 +[2025-02-19 22:16:20] (step=0280700) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:16:28] (step=0280800) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 22:16:37] (step=0280900) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0072 +[2025-02-19 22:16:45] (step=0281000) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:16:54] (step=0281100) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-19 22:17:03] (step=0281200) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0073 +[2025-02-19 22:17:11] (step=0281300) Train Loss: 0.1511, Train Steps/Sec: 11.27, Grad Norm: 0.0066 +[2025-02-19 22:17:21] (step=0281400) Train Loss: 0.1515, Train Steps/Sec: 10.30, Grad Norm: 0.0068 +[2025-02-19 22:17:31] (step=0281500) Train Loss: 0.1511, Train Steps/Sec: 9.74, Grad Norm: 0.0067 +[2025-02-19 22:17:40] (step=0281600) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 22:17:49] (step=0281700) Train Loss: 0.1533, Train Steps/Sec: 10.87, Grad Norm: 0.0077 +[2025-02-19 22:17:58] (step=0281800) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-19 22:18:07] (step=0281900) Train Loss: 0.1517, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 22:18:15] (step=0282000) Train Loss: 0.1519, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 22:18:24] (step=0282100) Train Loss: 0.1518, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-19 22:18:32] (step=0282200) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-19 22:18:41] (step=0282300) Train Loss: 0.1523, Train Steps/Sec: 11.59, Grad Norm: 0.0071 +[2025-02-19 22:18:50] (step=0282400) Train Loss: 0.1498, Train Steps/Sec: 11.59, Grad Norm: 0.0068 +[2025-02-19 22:18:58] (step=0282500) Train Loss: 0.1514, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-19 22:19:07] (step=0282600) Train Loss: 0.1517, Train Steps/Sec: 11.58, Grad Norm: 0.0067 +[2025-02-19 22:19:16] (step=0282700) Train Loss: 0.1516, Train Steps/Sec: 11.54, Grad Norm: 0.0069 +[2025-02-19 22:19:26] (step=0282800) Train Loss: 0.1496, Train Steps/Sec: 9.94, Grad Norm: 0.0066 +[2025-02-19 22:19:34] (step=0282900) Train Loss: 0.1514, Train Steps/Sec: 11.53, Grad Norm: 0.0068 +[2025-02-19 22:19:43] (step=0283000) Train Loss: 0.1514, Train Steps/Sec: 11.52, Grad Norm: 0.0066 +[2025-02-19 22:19:52] (step=0283100) Train Loss: 0.1515, Train Steps/Sec: 11.53, Grad Norm: 0.0069 +[2025-02-19 22:20:00] (step=0283200) Train Loss: 0.1517, Train Steps/Sec: 11.52, Grad Norm: 0.0066 +[2025-02-19 22:20:09] (step=0283300) Train Loss: 0.1507, Train Steps/Sec: 11.53, Grad Norm: 0.0066 +[2025-02-19 22:20:18] (step=0283400) Train Loss: 0.1518, Train Steps/Sec: 11.54, Grad Norm: 0.0066 +[2025-02-19 22:20:27] (step=0283500) Train Loss: 0.1511, Train Steps/Sec: 10.55, Grad Norm: 0.0070 +[2025-02-19 22:20:36] (step=0283600) Train Loss: 0.1499, Train Steps/Sec: 10.84, Grad Norm: 0.0067 +[2025-02-19 22:20:45] (step=0283700) Train Loss: 0.1509, Train Steps/Sec: 11.20, Grad Norm: 0.0068 +[2025-02-19 22:20:54] (step=0283800) Train Loss: 0.1516, Train Steps/Sec: 11.15, Grad Norm: 0.0070 +[2025-02-19 22:21:03] (step=0283900) Train Loss: 0.1512, Train Steps/Sec: 11.16, Grad Norm: 0.0067 +[2025-02-19 22:21:13] (step=0284000) Train Loss: 0.1494, Train Steps/Sec: 10.01, Grad Norm: 0.0066 +[2025-02-19 22:21:22] (step=0284100) Train Loss: 0.1511, Train Steps/Sec: 11.59, Grad Norm: 0.0067 +[2025-02-19 22:21:30] (step=0284200) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0067 +[2025-02-19 22:21:39] (step=0284300) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0067 +[2025-02-19 22:21:48] (step=0284400) Train Loss: 0.1512, Train Steps/Sec: 11.59, Grad Norm: 0.0066 +[2025-02-19 22:21:56] (step=0284500) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 22:22:05] (step=0284600) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-19 22:22:14] (step=0284700) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-19 22:22:22] (step=0284800) Train Loss: 0.1514, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 22:22:31] (step=0284900) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 22:22:39] (step=0285000) Train Loss: 0.1522, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 22:22:48] (step=0285100) Train Loss: 0.1512, Train Steps/Sec: 11.59, Grad Norm: 0.0067 +[2025-02-19 22:22:57] (step=0285200) Train Loss: 0.1498, Train Steps/Sec: 11.54, Grad Norm: 0.0064 +[2025-02-19 22:23:07] (step=0285300) Train Loss: 0.1502, Train Steps/Sec: 10.01, Grad Norm: 0.0067 +[2025-02-19 22:23:15] (step=0285400) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 22:23:24] (step=0285500) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 22:23:33] (step=0285600) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-19 22:23:42] (step=0285700) Train Loss: 0.1506, Train Steps/Sec: 10.29, Grad Norm: 0.0065 +[2025-02-19 22:23:52] (step=0285800) Train Loss: 0.1504, Train Steps/Sec: 10.90, Grad Norm: 0.0067 +[2025-02-19 22:24:00] (step=0285900) Train Loss: 0.1499, Train Steps/Sec: 11.58, Grad Norm: 0.0067 +[2025-02-19 22:24:09] (step=0286000) Train Loss: 0.1512, Train Steps/Sec: 10.85, Grad Norm: 0.0068 +[2025-02-19 22:24:18] (step=0286100) Train Loss: 0.1515, Train Steps/Sec: 11.57, Grad Norm: 0.0066 +[2025-02-19 22:24:27] (step=0286200) Train Loss: 0.1507, Train Steps/Sec: 11.56, Grad Norm: 0.0066 +[2025-02-19 22:24:35] (step=0286300) Train Loss: 0.1508, Train Steps/Sec: 11.57, Grad Norm: 0.0067 +[2025-02-19 22:24:44] (step=0286400) Train Loss: 0.1499, Train Steps/Sec: 11.58, Grad Norm: 0.0066 +[2025-02-19 22:24:54] (step=0286500) Train Loss: 0.1512, Train Steps/Sec: 10.03, Grad Norm: 0.0069 +[2025-02-19 22:25:03] (step=0286600) Train Loss: 0.1497, Train Steps/Sec: 11.53, Grad Norm: 0.0066 +[2025-02-19 22:25:11] (step=0286700) Train Loss: 0.1523, Train Steps/Sec: 11.53, Grad Norm: 0.0066 +[2025-02-19 22:25:20] (step=0286800) Train Loss: 0.1520, Train Steps/Sec: 11.53, Grad Norm: 0.0066 +[2025-02-19 22:25:29] (step=0286900) Train Loss: 0.1521, Train Steps/Sec: 11.54, Grad Norm: 0.0070 +[2025-02-19 22:25:37] (step=0287000) Train Loss: 0.1490, Train Steps/Sec: 11.54, Grad Norm: 0.0068 +[2025-02-19 22:25:46] (step=0287100) Train Loss: 0.1526, Train Steps/Sec: 11.54, Grad Norm: 0.0067 +[2025-02-19 22:25:55] (step=0287200) Train Loss: 0.1505, Train Steps/Sec: 11.53, Grad Norm: 0.0068 +[2025-02-19 22:26:03] (step=0287300) Train Loss: 0.1525, Train Steps/Sec: 11.54, Grad Norm: 0.0070 +[2025-02-19 22:26:12] (step=0287400) Train Loss: 0.1523, Train Steps/Sec: 11.54, Grad Norm: 0.0070 +[2025-02-19 22:26:21] (step=0287500) Train Loss: 0.1520, Train Steps/Sec: 11.54, Grad Norm: 0.0068 +[2025-02-19 22:26:29] (step=0287600) Train Loss: 0.1511, Train Steps/Sec: 11.55, Grad Norm: 0.0066 +[2025-02-19 22:26:38] (step=0287700) Train Loss: 0.1498, Train Steps/Sec: 11.50, Grad Norm: 0.0066 +[2025-02-19 22:26:48] (step=0287800) Train Loss: 0.1498, Train Steps/Sec: 9.99, Grad Norm: 0.0067 +[2025-02-19 22:26:58] (step=0287900) Train Loss: 0.1507, Train Steps/Sec: 9.96, Grad Norm: 0.0066 +[2025-02-19 22:27:07] (step=0288000) Train Loss: 0.1520, Train Steps/Sec: 11.24, Grad Norm: 0.0069 +[2025-02-19 22:27:16] (step=0288100) Train Loss: 0.1497, Train Steps/Sec: 11.57, Grad Norm: 0.0069 +[2025-02-19 22:27:25] (step=0288200) Train Loss: 0.1500, Train Steps/Sec: 10.84, Grad Norm: 0.0070 +[2025-02-19 22:27:33] (step=0288300) Train Loss: 0.1505, Train Steps/Sec: 11.59, Grad Norm: 0.0067 +[2025-02-19 22:27:42] (step=0288400) Train Loss: 0.1523, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 22:27:51] (step=0288500) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-19 22:27:59] (step=0288600) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 22:28:08] (step=0288700) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0070 +[2025-02-19 22:28:16] (step=0288800) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 22:28:25] (step=0288900) Train Loss: 0.1513, Train Steps/Sec: 11.59, Grad Norm: 0.0066 +[2025-02-19 22:28:35] (step=0289000) Train Loss: 0.1510, Train Steps/Sec: 10.01, Grad Norm: 0.0068 +[2025-02-19 22:28:44] (step=0289100) Train Loss: 0.1501, Train Steps/Sec: 11.65, Grad Norm: 0.0066 +[2025-02-19 22:28:52] (step=0289200) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 22:29:01] (step=0289300) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 22:29:09] (step=0289400) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:29:18] (step=0289500) Train Loss: 0.1519, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 22:29:27] (step=0289600) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-19 22:29:35] (step=0289700) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 22:29:44] (step=0289800) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0071 +[2025-02-19 22:29:52] (step=0289900) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 22:30:01] (step=0290000) Train Loss: 0.1503, Train Steps/Sec: 11.27, Grad Norm: 0.0065 +[2025-02-19 22:30:11] (step=0290100) Train Loss: 0.1499, Train Steps/Sec: 10.26, Grad Norm: 0.0066 +[2025-02-19 22:30:20] (step=0290200) Train Loss: 0.1499, Train Steps/Sec: 11.21, Grad Norm: 0.0067 +[2025-02-19 22:30:30] (step=0290300) Train Loss: 0.1523, Train Steps/Sec: 10.02, Grad Norm: 0.0067 +[2025-02-19 22:30:39] (step=0290400) Train Loss: 0.1500, Train Steps/Sec: 10.87, Grad Norm: 0.0067 +[2025-02-19 22:30:48] (step=0290500) Train Loss: 0.1517, Train Steps/Sec: 11.59, Grad Norm: 0.0070 +[2025-02-19 22:30:56] (step=0290600) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 22:31:05] (step=0290700) Train Loss: 0.1516, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 22:31:14] (step=0290800) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 22:31:22] (step=0290900) Train Loss: 0.1516, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 22:31:31] (step=0291000) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 22:31:39] (step=0291100) Train Loss: 0.1482, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 22:31:48] (step=0291200) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 22:31:57] (step=0291300) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 22:32:05] (step=0291400) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 22:32:15] (step=0291500) Train Loss: 0.1508, Train Steps/Sec: 10.03, Grad Norm: 0.0068 +[2025-02-19 22:32:24] (step=0291600) Train Loss: 0.1512, Train Steps/Sec: 11.54, Grad Norm: 0.0064 +[2025-02-19 22:32:33] (step=0291700) Train Loss: 0.1514, Train Steps/Sec: 11.53, Grad Norm: 0.0065 +[2025-02-19 22:32:41] (step=0291800) Train Loss: 0.1513, Train Steps/Sec: 11.55, Grad Norm: 0.0067 +[2025-02-19 22:32:50] (step=0291900) Train Loss: 0.1511, Train Steps/Sec: 11.55, Grad Norm: 0.0065 +[2025-02-19 22:32:59] (step=0292000) Train Loss: 0.1502, Train Steps/Sec: 11.55, Grad Norm: 0.0065 +[2025-02-19 22:33:07] (step=0292100) Train Loss: 0.1512, Train Steps/Sec: 11.55, Grad Norm: 0.0067 +[2025-02-19 22:33:17] (step=0292200) Train Loss: 0.1512, Train Steps/Sec: 10.55, Grad Norm: 0.0067 +[2025-02-19 22:33:26] (step=0292300) Train Loss: 0.1513, Train Steps/Sec: 10.56, Grad Norm: 0.0067 +[2025-02-19 22:33:35] (step=0292400) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-19 22:33:43] (step=0292500) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 22:33:53] (step=0292600) Train Loss: 0.1510, Train Steps/Sec: 10.88, Grad Norm: 0.0066 +[2025-02-19 22:34:01] (step=0292700) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 22:34:11] (step=0292800) Train Loss: 0.1498, Train Steps/Sec: 9.98, Grad Norm: 0.0067 +[2025-02-19 22:34:20] (step=0292900) Train Loss: 0.1518, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 22:34:28] (step=0293000) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 22:34:37] (step=0293100) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 22:34:46] (step=0293200) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 22:34:54] (step=0293300) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-19 22:35:03] (step=0293400) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 22:35:12] (step=0293500) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 22:35:20] (step=0293600) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 22:35:29] (step=0293700) Train Loss: 0.1521, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:35:37] (step=0293800) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 22:35:46] (step=0293900) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:35:56] (step=0294000) Train Loss: 0.1513, Train Steps/Sec: 9.98, Grad Norm: 0.0066 +[2025-02-19 22:36:05] (step=0294100) Train Loss: 0.1505, Train Steps/Sec: 11.57, Grad Norm: 0.0068 +[2025-02-19 22:36:13] (step=0294200) Train Loss: 0.1517, Train Steps/Sec: 11.59, Grad Norm: 0.0067 +[2025-02-19 22:36:22] (step=0294300) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-19 22:36:31] (step=0294400) Train Loss: 0.1510, Train Steps/Sec: 10.59, Grad Norm: 0.0065 +[2025-02-19 22:36:40] (step=0294500) Train Loss: 0.1504, Train Steps/Sec: 10.88, Grad Norm: 0.0067 +[2025-02-19 22:36:49] (step=0294600) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 22:36:58] (step=0294700) Train Loss: 0.1509, Train Steps/Sec: 10.87, Grad Norm: 0.0066 +[2025-02-19 22:37:07] (step=0294800) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 22:37:16] (step=0294900) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 22:37:24] (step=0295000) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 22:37:33] (step=0295100) Train Loss: 0.1516, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-19 22:37:41] (step=0295200) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 22:37:51] (step=0295300) Train Loss: 0.1513, Train Steps/Sec: 9.96, Grad Norm: 0.0069 +[2025-02-19 22:38:00] (step=0295400) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0069 +[2025-02-19 22:38:09] (step=0295500) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 22:38:17] (step=0295600) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-19 22:38:26] (step=0295700) Train Loss: 0.1513, Train Steps/Sec: 11.59, Grad Norm: 0.0068 +[2025-02-19 22:38:35] (step=0295800) Train Loss: 0.1497, Train Steps/Sec: 11.59, Grad Norm: 0.0068 +[2025-02-19 22:38:43] (step=0295900) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0067 +[2025-02-19 22:38:52] (step=0296000) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0068 +[2025-02-19 22:39:00] (step=0296100) Train Loss: 0.1497, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-19 22:39:09] (step=0296200) Train Loss: 0.1511, Train Steps/Sec: 11.59, Grad Norm: 0.0068 +[2025-02-19 22:39:18] (step=0296300) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-19 22:39:26] (step=0296400) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 22:39:37] (step=0296500) Train Loss: 0.1511, Train Steps/Sec: 9.73, Grad Norm: 0.0066 +[2025-02-19 22:39:46] (step=0296600) Train Loss: 0.1501, Train Steps/Sec: 10.28, Grad Norm: 0.0066 +[2025-02-19 22:39:55] (step=0296700) Train Loss: 0.1520, Train Steps/Sec: 11.32, Grad Norm: 0.0066 +[2025-02-19 22:40:04] (step=0296800) Train Loss: 0.1503, Train Steps/Sec: 11.66, Grad Norm: 0.0065 +[2025-02-19 22:40:13] (step=0296900) Train Loss: 0.1506, Train Steps/Sec: 10.90, Grad Norm: 0.0064 +[2025-02-19 22:40:22] (step=0297000) Train Loss: 0.1510, Train Steps/Sec: 11.66, Grad Norm: 0.0068 +[2025-02-19 22:40:30] (step=0297100) Train Loss: 0.1501, Train Steps/Sec: 11.67, Grad Norm: 0.0068 +[2025-02-19 22:40:39] (step=0297200) Train Loss: 0.1520, Train Steps/Sec: 11.66, Grad Norm: 0.0068 +[2025-02-19 22:40:47] (step=0297300) Train Loss: 0.1521, Train Steps/Sec: 11.66, Grad Norm: 0.0068 +[2025-02-19 22:40:56] (step=0297400) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 22:41:04] (step=0297500) Train Loss: 0.1517, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-19 22:41:13] (step=0297600) Train Loss: 0.1497, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 22:41:22] (step=0297700) Train Loss: 0.1511, Train Steps/Sec: 11.65, Grad Norm: 0.0070 +[2025-02-19 22:41:32] (step=0297800) Train Loss: 0.1505, Train Steps/Sec: 9.96, Grad Norm: 0.0069 +[2025-02-19 22:41:40] (step=0297900) Train Loss: 0.1523, Train Steps/Sec: 11.65, Grad Norm: 0.0065 +[2025-02-19 22:41:49] (step=0298000) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 22:41:57] (step=0298100) Train Loss: 0.1516, Train Steps/Sec: 11.65, Grad Norm: 0.0066 +[2025-02-19 22:42:06] (step=0298200) Train Loss: 0.1519, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-19 22:42:15] (step=0298300) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-19 22:42:23] (step=0298400) Train Loss: 0.1489, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 22:42:32] (step=0298500) Train Loss: 0.1520, Train Steps/Sec: 11.66, Grad Norm: 0.0069 +[2025-02-19 22:42:40] (step=0298600) Train Loss: 0.1493, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 22:42:49] (step=0298700) Train Loss: 0.1514, Train Steps/Sec: 10.94, Grad Norm: 0.0065 +[2025-02-19 22:42:59] (step=0298800) Train Loss: 0.1510, Train Steps/Sec: 10.61, Grad Norm: 0.0068 +[2025-02-19 22:43:08] (step=0298900) Train Loss: 0.1513, Train Steps/Sec: 11.30, Grad Norm: 0.0066 +[2025-02-19 22:43:18] (step=0299000) Train Loss: 0.1508, Train Steps/Sec: 9.98, Grad Norm: 0.0064 +[2025-02-19 22:43:27] (step=0299100) Train Loss: 0.1506, Train Steps/Sec: 10.89, Grad Norm: 0.0066 +[2025-02-19 22:43:36] (step=0299200) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 22:43:44] (step=0299300) Train Loss: 0.1514, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 22:43:53] (step=0299400) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 22:44:01] (step=0299500) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-19 22:44:10] (step=0299600) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-19 22:44:19] (step=0299700) Train Loss: 0.1520, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:44:27] (step=0299800) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 22:44:36] (step=0299900) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:44:44] (step=0300000) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 22:44:45] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0300000.pt +[2025-02-19 22:44:54] (step=0300100) Train Loss: 0.1515, Train Steps/Sec: 10.61, Grad Norm: 0.0067 +[2025-02-19 22:45:02] (step=0300200) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:45:12] (step=0300300) Train Loss: 0.1520, Train Steps/Sec: 9.87, Grad Norm: 0.0068 +[2025-02-19 22:45:21] (step=0300400) Train Loss: 0.1516, Train Steps/Sec: 11.65, Grad Norm: 0.0065 +[2025-02-19 22:45:30] (step=0300500) Train Loss: 0.1508, Train Steps/Sec: 11.65, Grad Norm: 0.0065 +[2025-02-19 22:45:38] (step=0300600) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0065 +[2025-02-19 22:45:47] (step=0300700) Train Loss: 0.1520, Train Steps/Sec: 11.67, Grad Norm: 0.0067 +[2025-02-19 22:45:55] (step=0300800) Train Loss: 0.1509, Train Steps/Sec: 11.66, Grad Norm: 0.0065 +[2025-02-19 22:46:05] (step=0300900) Train Loss: 0.1503, Train Steps/Sec: 10.64, Grad Norm: 0.0066 +[2025-02-19 22:46:14] (step=0301000) Train Loss: 0.1507, Train Steps/Sec: 10.61, Grad Norm: 0.0067 +[2025-02-19 22:46:23] (step=0301100) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 22:46:32] (step=0301200) Train Loss: 0.1501, Train Steps/Sec: 11.24, Grad Norm: 0.0068 +[2025-02-19 22:46:41] (step=0301300) Train Loss: 0.1507, Train Steps/Sec: 11.26, Grad Norm: 0.0068 +[2025-02-19 22:46:49] (step=0301400) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 22:46:59] (step=0301500) Train Loss: 0.1507, Train Steps/Sec: 10.02, Grad Norm: 0.0066 +[2025-02-19 22:47:08] (step=0301600) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 22:47:16] (step=0301700) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 22:47:25] (step=0301800) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-19 22:47:34] (step=0301900) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 22:47:42] (step=0302000) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 22:47:51] (step=0302100) Train Loss: 0.1514, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:47:59] (step=0302200) Train Loss: 0.1520, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 22:48:08] (step=0302300) Train Loss: 0.1523, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 22:48:17] (step=0302400) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 22:48:25] (step=0302500) Train Loss: 0.1488, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-19 22:48:34] (step=0302600) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 22:48:42] (step=0302700) Train Loss: 0.1516, Train Steps/Sec: 11.62, Grad Norm: 0.0070 +[2025-02-19 22:48:52] (step=0302800) Train Loss: 0.1507, Train Steps/Sec: 10.01, Grad Norm: 0.0066 +[2025-02-19 22:49:01] (step=0302900) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 22:49:10] (step=0303000) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 22:49:20] (step=0303100) Train Loss: 0.1513, Train Steps/Sec: 10.00, Grad Norm: 0.0069 +[2025-02-19 22:49:29] (step=0303200) Train Loss: 0.1502, Train Steps/Sec: 11.27, Grad Norm: 0.0070 +[2025-02-19 22:49:37] (step=0303300) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 22:49:46] (step=0303400) Train Loss: 0.1515, Train Steps/Sec: 10.90, Grad Norm: 0.0068 +[2025-02-19 22:49:55] (step=0303500) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:50:03] (step=0303600) Train Loss: 0.1521, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 22:50:12] (step=0303700) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 22:50:21] (step=0303800) Train Loss: 0.1516, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-19 22:50:29] (step=0303900) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-19 22:50:39] (step=0304000) Train Loss: 0.1506, Train Steps/Sec: 10.01, Grad Norm: 0.0071 +[2025-02-19 22:50:48] (step=0304100) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 22:50:56] (step=0304200) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 22:51:05] (step=0304300) Train Loss: 0.1516, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 22:51:14] (step=0304400) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-19 22:51:22] (step=0304500) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 22:51:31] (step=0304600) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 22:51:39] (step=0304700) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-19 22:51:48] (step=0304800) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:51:57] (step=0304900) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0069 +[2025-02-19 22:52:05] (step=0305000) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 22:52:14] (step=0305100) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:52:22] (step=0305200) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 22:52:34] (step=0305300) Train Loss: 0.1503, Train Steps/Sec: 8.77, Grad Norm: 0.0066 +[2025-02-19 22:52:43] (step=0305400) Train Loss: 0.1516, Train Steps/Sec: 11.28, Grad Norm: 0.0065 +[2025-02-19 22:52:51] (step=0305500) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 22:53:01] (step=0305600) Train Loss: 0.1498, Train Steps/Sec: 10.89, Grad Norm: 0.0066 +[2025-02-19 22:53:09] (step=0305700) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 22:53:18] (step=0305800) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-19 22:53:26] (step=0305900) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 22:53:35] (step=0306000) Train Loss: 0.1519, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-19 22:53:44] (step=0306100) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 22:53:52] (step=0306200) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:54:01] (step=0306300) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 22:54:09] (step=0306400) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 22:54:19] (step=0306500) Train Loss: 0.1513, Train Steps/Sec: 9.92, Grad Norm: 0.0067 +[2025-02-19 22:54:28] (step=0306600) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 22:54:37] (step=0306700) Train Loss: 0.1516, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 22:54:45] (step=0306800) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:54:54] (step=0306900) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 22:55:02] (step=0307000) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 22:55:11] (step=0307100) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 22:55:20] (step=0307200) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 22:55:28] (step=0307300) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 22:55:38] (step=0307400) Train Loss: 0.1508, Train Steps/Sec: 10.60, Grad Norm: 0.0067 +[2025-02-19 22:55:47] (step=0307500) Train Loss: 0.1517, Train Steps/Sec: 10.91, Grad Norm: 0.0069 +[2025-02-19 22:55:56] (step=0307600) Train Loss: 0.1508, Train Steps/Sec: 11.28, Grad Norm: 0.0066 +[2025-02-19 22:56:04] (step=0307700) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 22:56:15] (step=0307800) Train Loss: 0.1497, Train Steps/Sec: 9.40, Grad Norm: 0.0065 +[2025-02-19 22:56:24] (step=0307900) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 22:56:32] (step=0308000) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 22:56:41] (step=0308100) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 22:56:49] (step=0308200) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-19 22:56:58] (step=0308300) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 22:57:07] (step=0308400) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 22:57:15] (step=0308500) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 22:57:24] (step=0308600) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 22:57:32] (step=0308700) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-19 22:57:41] (step=0308800) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 22:57:50] (step=0308900) Train Loss: 0.1521, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:58:00] (step=0309000) Train Loss: 0.1502, Train Steps/Sec: 9.94, Grad Norm: 0.0066 +[2025-02-19 22:58:08] (step=0309100) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 22:58:17] (step=0309200) Train Loss: 0.1511, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-19 22:58:25] (step=0309300) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 22:58:34] (step=0309400) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 22:58:43] (step=0309500) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 22:58:52] (step=0309600) Train Loss: 0.1527, Train Steps/Sec: 10.61, Grad Norm: 0.0067 +[2025-02-19 22:59:01] (step=0309700) Train Loss: 0.1522, Train Steps/Sec: 10.64, Grad Norm: 0.0068 +[2025-02-19 22:59:10] (step=0309800) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 22:59:19] (step=0309900) Train Loss: 0.1519, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 22:59:28] (step=0310000) Train Loss: 0.1505, Train Steps/Sec: 10.87, Grad Norm: 0.0066 +[2025-02-19 22:59:36] (step=0310100) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 22:59:45] (step=0310200) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 22:59:55] (step=0310300) Train Loss: 0.1494, Train Steps/Sec: 10.00, Grad Norm: 0.0066 +[2025-02-19 23:00:04] (step=0310400) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 23:00:12] (step=0310500) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 23:00:21] (step=0310600) Train Loss: 0.1510, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-19 23:00:30] (step=0310700) Train Loss: 0.1510, Train Steps/Sec: 11.53, Grad Norm: 0.0066 +[2025-02-19 23:00:38] (step=0310800) Train Loss: 0.1510, Train Steps/Sec: 11.53, Grad Norm: 0.0066 +[2025-02-19 23:00:47] (step=0310900) Train Loss: 0.1518, Train Steps/Sec: 11.51, Grad Norm: 0.0067 +[2025-02-19 23:00:56] (step=0311000) Train Loss: 0.1500, Train Steps/Sec: 11.52, Grad Norm: 0.0065 +[2025-02-19 23:01:04] (step=0311100) Train Loss: 0.1509, Train Steps/Sec: 11.52, Grad Norm: 0.0067 +[2025-02-19 23:01:13] (step=0311200) Train Loss: 0.1511, Train Steps/Sec: 11.54, Grad Norm: 0.0065 +[2025-02-19 23:01:22] (step=0311300) Train Loss: 0.1519, Train Steps/Sec: 11.54, Grad Norm: 0.0068 +[2025-02-19 23:01:30] (step=0311400) Train Loss: 0.1509, Train Steps/Sec: 11.55, Grad Norm: 0.0066 +[2025-02-19 23:01:40] (step=0311500) Train Loss: 0.1512, Train Steps/Sec: 9.95, Grad Norm: 0.0066 +[2025-02-19 23:01:49] (step=0311600) Train Loss: 0.1521, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 23:01:58] (step=0311700) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 23:02:08] (step=0311800) Train Loss: 0.1516, Train Steps/Sec: 9.91, Grad Norm: 0.0066 +[2025-02-19 23:02:17] (step=0311900) Train Loss: 0.1514, Train Steps/Sec: 11.13, Grad Norm: 0.0067 +[2025-02-19 23:02:25] (step=0312000) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-19 23:02:34] (step=0312100) Train Loss: 0.1499, Train Steps/Sec: 11.29, Grad Norm: 0.0066 +[2025-02-19 23:02:43] (step=0312200) Train Loss: 0.1521, Train Steps/Sec: 11.27, Grad Norm: 0.0065 +[2025-02-19 23:02:52] (step=0312300) Train Loss: 0.1522, Train Steps/Sec: 11.66, Grad Norm: 0.0066 +[2025-02-19 23:03:00] (step=0312400) Train Loss: 0.1520, Train Steps/Sec: 11.59, Grad Norm: 0.0066 +[2025-02-19 23:03:09] (step=0312500) Train Loss: 0.1519, Train Steps/Sec: 11.50, Grad Norm: 0.0066 +[2025-02-19 23:03:17] (step=0312600) Train Loss: 0.1507, Train Steps/Sec: 11.65, Grad Norm: 0.0065 +[2025-02-19 23:03:26] (step=0312700) Train Loss: 0.1505, Train Steps/Sec: 11.47, Grad Norm: 0.0067 +[2025-02-19 23:03:36] (step=0312800) Train Loss: 0.1515, Train Steps/Sec: 9.83, Grad Norm: 0.0067 +[2025-02-19 23:03:45] (step=0312900) Train Loss: 0.1518, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 23:03:54] (step=0313000) Train Loss: 0.1515, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-19 23:04:02] (step=0313100) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 23:04:11] (step=0313200) Train Loss: 0.1521, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:04:19] (step=0313300) Train Loss: 0.1515, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 23:04:28] (step=0313400) Train Loss: 0.1522, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:04:37] (step=0313500) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 23:04:45] (step=0313600) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-19 23:04:54] (step=0313700) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 23:05:03] (step=0313800) Train Loss: 0.1526, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-19 23:05:11] (step=0313900) Train Loss: 0.1494, Train Steps/Sec: 11.27, Grad Norm: 0.0064 +[2025-02-19 23:05:21] (step=0314000) Train Loss: 0.1506, Train Steps/Sec: 10.21, Grad Norm: 0.0069 +[2025-02-19 23:05:31] (step=0314100) Train Loss: 0.1504, Train Steps/Sec: 9.77, Grad Norm: 0.0066 +[2025-02-19 23:05:40] (step=0314200) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:05:49] (step=0314300) Train Loss: 0.1520, Train Steps/Sec: 11.26, Grad Norm: 0.0066 +[2025-02-19 23:05:58] (step=0314400) Train Loss: 0.1502, Train Steps/Sec: 11.21, Grad Norm: 0.0065 +[2025-02-19 23:06:07] (step=0314500) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 23:06:15] (step=0314600) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 23:06:24] (step=0314700) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:06:32] (step=0314800) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 23:06:41] (step=0314900) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 23:06:50] (step=0315000) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0071 +[2025-02-19 23:06:58] (step=0315100) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 23:07:07] (step=0315200) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-19 23:07:17] (step=0315300) Train Loss: 0.1510, Train Steps/Sec: 9.98, Grad Norm: 0.0065 +[2025-02-19 23:07:25] (step=0315400) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 23:07:34] (step=0315500) Train Loss: 0.1513, Train Steps/Sec: 11.59, Grad Norm: 0.0066 +[2025-02-19 23:07:43] (step=0315600) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-19 23:07:51] (step=0315700) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 23:08:00] (step=0315800) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:08:09] (step=0315900) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-19 23:08:17] (step=0316000) Train Loss: 0.1516, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 23:08:27] (step=0316100) Train Loss: 0.1510, Train Steps/Sec: 10.61, Grad Norm: 0.0065 +[2025-02-19 23:08:36] (step=0316200) Train Loss: 0.1504, Train Steps/Sec: 10.61, Grad Norm: 0.0065 +[2025-02-19 23:08:45] (step=0316300) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-19 23:08:53] (step=0316400) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-19 23:09:02] (step=0316500) Train Loss: 0.1501, Train Steps/Sec: 10.89, Grad Norm: 0.0067 +[2025-02-19 23:09:12] (step=0316600) Train Loss: 0.1499, Train Steps/Sec: 10.00, Grad Norm: 0.0065 +[2025-02-19 23:09:21] (step=0316700) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-19 23:09:30] (step=0316800) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-19 23:09:38] (step=0316900) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-19 23:09:47] (step=0317000) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 23:09:55] (step=0317100) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 23:10:04] (step=0317200) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 23:10:13] (step=0317300) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-19 23:10:21] (step=0317400) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 23:10:30] (step=0317500) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 23:10:38] (step=0317600) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-19 23:10:47] (step=0317700) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-19 23:10:57] (step=0317800) Train Loss: 0.1516, Train Steps/Sec: 10.03, Grad Norm: 0.0068 +[2025-02-19 23:11:06] (step=0317900) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:11:14] (step=0318000) Train Loss: 0.1517, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 23:11:23] (step=0318100) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-19 23:11:31] (step=0318200) Train Loss: 0.1514, Train Steps/Sec: 11.65, Grad Norm: 0.0068 +[2025-02-19 23:11:41] (step=0318300) Train Loss: 0.1505, Train Steps/Sec: 10.29, Grad Norm: 0.0067 +[2025-02-19 23:11:50] (step=0318400) Train Loss: 0.1503, Train Steps/Sec: 10.93, Grad Norm: 0.0068 +[2025-02-19 23:11:59] (step=0318500) Train Loss: 0.1525, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-19 23:12:07] (step=0318600) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-19 23:12:17] (step=0318700) Train Loss: 0.1508, Train Steps/Sec: 10.93, Grad Norm: 0.0066 +[2025-02-19 23:12:25] (step=0318800) Train Loss: 0.1530, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 23:12:34] (step=0318900) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-19 23:12:42] (step=0319000) Train Loss: 0.1510, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-19 23:12:52] (step=0319100) Train Loss: 0.1488, Train Steps/Sec: 10.00, Grad Norm: 0.0067 +[2025-02-19 23:13:01] (step=0319200) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 23:13:10] (step=0319300) Train Loss: 0.1526, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 23:13:18] (step=0319400) Train Loss: 0.1514, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:13:27] (step=0319500) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 23:13:35] (step=0319600) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 23:13:44] (step=0319700) Train Loss: 0.1522, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 23:13:53] (step=0319800) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-19 23:14:01] (step=0319900) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 23:14:10] (step=0320000) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-19 23:14:11] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0320000.pt +[2025-02-19 23:14:19] (step=0320100) Train Loss: 0.1507, Train Steps/Sec: 10.44, Grad Norm: 0.0065 +[2025-02-19 23:14:28] (step=0320200) Train Loss: 0.1522, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-19 23:14:38] (step=0320300) Train Loss: 0.1500, Train Steps/Sec: 10.01, Grad Norm: 0.0065 +[2025-02-19 23:14:47] (step=0320400) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 23:14:57] (step=0320500) Train Loss: 0.1503, Train Steps/Sec: 9.98, Grad Norm: 0.0067 +[2025-02-19 23:15:06] (step=0320600) Train Loss: 0.1503, Train Steps/Sec: 11.27, Grad Norm: 0.0067 +[2025-02-19 23:15:14] (step=0320700) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 23:15:23] (step=0320800) Train Loss: 0.1501, Train Steps/Sec: 11.29, Grad Norm: 0.0065 +[2025-02-19 23:15:32] (step=0320900) Train Loss: 0.1504, Train Steps/Sec: 11.24, Grad Norm: 0.0067 +[2025-02-19 23:15:40] (step=0321000) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-19 23:15:49] (step=0321100) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-19 23:15:58] (step=0321200) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 23:16:06] (step=0321300) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:16:15] (step=0321400) Train Loss: 0.1516, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 23:16:24] (step=0321500) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-19 23:16:34] (step=0321600) Train Loss: 0.1509, Train Steps/Sec: 9.99, Grad Norm: 0.0067 +[2025-02-19 23:16:42] (step=0321700) Train Loss: 0.1507, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-19 23:16:51] (step=0321800) Train Loss: 0.1510, Train Steps/Sec: 11.59, Grad Norm: 0.0066 +[2025-02-19 23:16:59] (step=0321900) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:17:08] (step=0322000) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-19 23:17:17] (step=0322100) Train Loss: 0.1518, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:17:25] (step=0322200) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 23:17:34] (step=0322300) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 23:17:42] (step=0322400) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 23:17:51] (step=0322500) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-19 23:18:00] (step=0322600) Train Loss: 0.1510, Train Steps/Sec: 11.31, Grad Norm: 0.0066 +[2025-02-19 23:18:10] (step=0322700) Train Loss: 0.1509, Train Steps/Sec: 10.28, Grad Norm: 0.0068 +[2025-02-19 23:18:20] (step=0322800) Train Loss: 0.1513, Train Steps/Sec: 9.74, Grad Norm: 0.0066 +[2025-02-19 23:18:28] (step=0322900) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:18:37] (step=0323000) Train Loss: 0.1515, Train Steps/Sec: 11.27, Grad Norm: 0.0065 +[2025-02-19 23:18:46] (step=0323100) Train Loss: 0.1500, Train Steps/Sec: 11.22, Grad Norm: 0.0065 +[2025-02-19 23:18:55] (step=0323200) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 23:19:03] (step=0323300) Train Loss: 0.1515, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-19 23:19:12] (step=0323400) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 23:19:21] (step=0323500) Train Loss: 0.1518, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-19 23:19:29] (step=0323600) Train Loss: 0.1490, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 23:19:38] (step=0323700) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0065 +[2025-02-19 23:19:46] (step=0323800) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-19 23:19:55] (step=0323900) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 23:20:04] (step=0324000) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0068 +[2025-02-19 23:20:14] (step=0324100) Train Loss: 0.1520, Train Steps/Sec: 10.08, Grad Norm: 0.0067 +[2025-02-19 23:20:22] (step=0324200) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 23:20:31] (step=0324300) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 23:20:39] (step=0324400) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 23:20:48] (step=0324500) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0071 +[2025-02-19 23:20:57] (step=0324600) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 23:21:05] (step=0324700) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 23:21:15] (step=0324800) Train Loss: 0.1514, Train Steps/Sec: 10.59, Grad Norm: 0.0067 +[2025-02-19 23:21:24] (step=0324900) Train Loss: 0.1509, Train Steps/Sec: 10.62, Grad Norm: 0.0068 +[2025-02-19 23:21:33] (step=0325000) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 23:21:41] (step=0325100) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 23:21:50] (step=0325200) Train Loss: 0.1519, Train Steps/Sec: 10.92, Grad Norm: 0.0066 +[2025-02-19 23:22:00] (step=0325300) Train Loss: 0.1526, Train Steps/Sec: 10.01, Grad Norm: 0.0067 +[2025-02-19 23:22:09] (step=0325400) Train Loss: 0.1517, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 23:22:18] (step=0325500) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-19 23:22:26] (step=0325600) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-19 23:22:35] (step=0325700) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0068 +[2025-02-19 23:22:43] (step=0325800) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-19 23:22:52] (step=0325900) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-19 23:23:01] (step=0326000) Train Loss: 0.1525, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-19 23:23:09] (step=0326100) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 23:23:18] (step=0326200) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 23:23:26] (step=0326300) Train Loss: 0.1517, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 23:23:35] (step=0326400) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-19 23:23:44] (step=0326500) Train Loss: 0.1512, Train Steps/Sec: 11.59, Grad Norm: 0.0066 +[2025-02-19 23:23:53] (step=0326600) Train Loss: 0.1514, Train Steps/Sec: 10.03, Grad Norm: 0.0066 +[2025-02-19 23:24:02] (step=0326700) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 23:24:11] (step=0326800) Train Loss: 0.1521, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:24:19] (step=0326900) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 23:24:29] (step=0327000) Train Loss: 0.1497, Train Steps/Sec: 9.89, Grad Norm: 0.0064 +[2025-02-19 23:24:38] (step=0327100) Train Loss: 0.1511, Train Steps/Sec: 11.21, Grad Norm: 0.0066 +[2025-02-19 23:24:47] (step=0327200) Train Loss: 0.1502, Train Steps/Sec: 11.53, Grad Norm: 0.0066 +[2025-02-19 23:24:56] (step=0327300) Train Loss: 0.1517, Train Steps/Sec: 11.54, Grad Norm: 0.0067 +[2025-02-19 23:25:05] (step=0327400) Train Loss: 0.1519, Train Steps/Sec: 10.85, Grad Norm: 0.0065 +[2025-02-19 23:25:14] (step=0327500) Train Loss: 0.1517, Train Steps/Sec: 11.54, Grad Norm: 0.0067 +[2025-02-19 23:25:22] (step=0327600) Train Loss: 0.1515, Train Steps/Sec: 11.55, Grad Norm: 0.0065 +[2025-02-19 23:25:31] (step=0327700) Train Loss: 0.1507, Train Steps/Sec: 11.54, Grad Norm: 0.0065 +[2025-02-19 23:25:41] (step=0327800) Train Loss: 0.1519, Train Steps/Sec: 9.98, Grad Norm: 0.0068 +[2025-02-19 23:25:50] (step=0327900) Train Loss: 0.1491, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 23:25:58] (step=0328000) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:26:07] (step=0328100) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:26:15] (step=0328200) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:26:24] (step=0328300) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:26:33] (step=0328400) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 23:26:41] (step=0328500) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-19 23:26:50] (step=0328600) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0072 +[2025-02-19 23:26:58] (step=0328700) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-19 23:27:07] (step=0328800) Train Loss: 0.1516, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-19 23:27:16] (step=0328900) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:27:24] (step=0329000) Train Loss: 0.1509, Train Steps/Sec: 11.57, Grad Norm: 0.0064 +[2025-02-19 23:27:35] (step=0329100) Train Loss: 0.1507, Train Steps/Sec: 9.71, Grad Norm: 0.0064 +[2025-02-19 23:27:44] (step=0329200) Train Loss: 0.1515, Train Steps/Sec: 10.26, Grad Norm: 0.0065 +[2025-02-19 23:27:53] (step=0329300) Train Loss: 0.1509, Train Steps/Sec: 11.29, Grad Norm: 0.0065 +[2025-02-19 23:28:02] (step=0329400) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-19 23:28:11] (step=0329500) Train Loss: 0.1514, Train Steps/Sec: 11.28, Grad Norm: 0.0064 +[2025-02-19 23:28:20] (step=0329600) Train Loss: 0.1504, Train Steps/Sec: 11.24, Grad Norm: 0.0065 +[2025-02-19 23:28:28] (step=0329700) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-19 23:28:37] (step=0329800) Train Loss: 0.1514, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-19 23:28:45] (step=0329900) Train Loss: 0.1516, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:28:54] (step=0330000) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:29:03] (step=0330100) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:29:11] (step=0330200) Train Loss: 0.1514, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:29:21] (step=0330300) Train Loss: 0.1491, Train Steps/Sec: 9.93, Grad Norm: 0.0064 +[2025-02-19 23:29:30] (step=0330400) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-19 23:29:39] (step=0330500) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-19 23:29:47] (step=0330600) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 23:29:56] (step=0330700) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:30:04] (step=0330800) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 23:30:13] (step=0330900) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-19 23:30:22] (step=0331000) Train Loss: 0.1521, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 23:30:30] (step=0331100) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:30:39] (step=0331200) Train Loss: 0.1514, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-19 23:30:48] (step=0331300) Train Loss: 0.1508, Train Steps/Sec: 10.57, Grad Norm: 0.0065 +[2025-02-19 23:30:57] (step=0331400) Train Loss: 0.1501, Train Steps/Sec: 10.88, Grad Norm: 0.0067 +[2025-02-19 23:31:06] (step=0331500) Train Loss: 0.1516, Train Steps/Sec: 11.01, Grad Norm: 0.0065 +[2025-02-19 23:31:17] (step=0331600) Train Loss: 0.1517, Train Steps/Sec: 9.91, Grad Norm: 0.0063 +[2025-02-19 23:31:25] (step=0331700) Train Loss: 0.1507, Train Steps/Sec: 11.24, Grad Norm: 0.0066 +[2025-02-19 23:31:34] (step=0331800) Train Loss: 0.1493, Train Steps/Sec: 11.19, Grad Norm: 0.0066 +[2025-02-19 23:31:43] (step=0331900) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-19 23:31:52] (step=0332000) Train Loss: 0.1517, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 23:32:00] (step=0332100) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:32:09] (step=0332200) Train Loss: 0.1523, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 23:32:17] (step=0332300) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:32:26] (step=0332400) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 23:32:35] (step=0332500) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-19 23:32:43] (step=0332600) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:32:52] (step=0332700) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:33:02] (step=0332800) Train Loss: 0.1503, Train Steps/Sec: 9.98, Grad Norm: 0.0067 +[2025-02-19 23:33:11] (step=0332900) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-19 23:33:19] (step=0333000) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-19 23:33:28] (step=0333100) Train Loss: 0.1523, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-19 23:33:36] (step=0333200) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 23:33:45] (step=0333300) Train Loss: 0.1512, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 23:33:54] (step=0333400) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:34:03] (step=0333500) Train Loss: 0.1522, Train Steps/Sec: 10.59, Grad Norm: 0.0066 +[2025-02-19 23:34:13] (step=0333600) Train Loss: 0.1502, Train Steps/Sec: 10.56, Grad Norm: 0.0065 +[2025-02-19 23:34:21] (step=0333700) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-19 23:34:30] (step=0333800) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-19 23:34:39] (step=0333900) Train Loss: 0.1514, Train Steps/Sec: 10.89, Grad Norm: 0.0066 +[2025-02-19 23:34:48] (step=0334000) Train Loss: 0.1505, Train Steps/Sec: 11.55, Grad Norm: 0.0064 +[2025-02-19 23:34:58] (step=0334100) Train Loss: 0.1515, Train Steps/Sec: 10.03, Grad Norm: 0.0066 +[2025-02-19 23:35:06] (step=0334200) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:35:15] (step=0334300) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:35:23] (step=0334400) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-19 23:35:32] (step=0334500) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 23:35:41] (step=0334600) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 23:35:49] (step=0334700) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:35:58] (step=0334800) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:36:07] (step=0334900) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 23:36:15] (step=0335000) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:36:24] (step=0335100) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:36:32] (step=0335200) Train Loss: 0.1519, Train Steps/Sec: 11.62, Grad Norm: 0.0069 +[2025-02-19 23:36:42] (step=0335300) Train Loss: 0.1500, Train Steps/Sec: 10.01, Grad Norm: 0.0063 +[2025-02-19 23:36:51] (step=0335400) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-19 23:37:00] (step=0335500) Train Loss: 0.1522, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:37:08] (step=0335600) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:37:18] (step=0335700) Train Loss: 0.1507, Train Steps/Sec: 10.01, Grad Norm: 0.0064 +[2025-02-19 23:37:27] (step=0335800) Train Loss: 0.1505, Train Steps/Sec: 11.25, Grad Norm: 0.0065 +[2025-02-19 23:37:36] (step=0335900) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-19 23:37:44] (step=0336000) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 23:37:53] (step=0336100) Train Loss: 0.1508, Train Steps/Sec: 10.90, Grad Norm: 0.0065 +[2025-02-19 23:38:02] (step=0336200) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-19 23:38:11] (step=0336300) Train Loss: 0.1517, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:38:19] (step=0336400) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 23:38:28] (step=0336500) Train Loss: 0.1508, Train Steps/Sec: 11.57, Grad Norm: 0.0065 +[2025-02-19 23:38:38] (step=0336600) Train Loss: 0.1501, Train Steps/Sec: 10.01, Grad Norm: 0.0066 +[2025-02-19 23:38:47] (step=0336700) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-19 23:38:55] (step=0336800) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-19 23:39:04] (step=0336900) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:39:12] (step=0337000) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 23:39:21] (step=0337100) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:39:30] (step=0337200) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 23:39:38] (step=0337300) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:39:47] (step=0337400) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:39:55] (step=0337500) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-19 23:40:04] (step=0337600) Train Loss: 0.1516, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:40:13] (step=0337700) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:40:23] (step=0337800) Train Loss: 0.1500, Train Steps/Sec: 9.72, Grad Norm: 0.0066 +[2025-02-19 23:40:33] (step=0337900) Train Loss: 0.1495, Train Steps/Sec: 10.24, Grad Norm: 0.0064 +[2025-02-19 23:40:42] (step=0338000) Train Loss: 0.1508, Train Steps/Sec: 11.23, Grad Norm: 0.0066 +[2025-02-19 23:40:50] (step=0338100) Train Loss: 0.1517, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:40:59] (step=0338200) Train Loss: 0.1511, Train Steps/Sec: 11.28, Grad Norm: 0.0064 +[2025-02-19 23:41:08] (step=0338300) Train Loss: 0.1506, Train Steps/Sec: 11.24, Grad Norm: 0.0064 +[2025-02-19 23:41:17] (step=0338400) Train Loss: 0.1515, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-19 23:41:25] (step=0338500) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 23:41:34] (step=0338600) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-19 23:41:42] (step=0338700) Train Loss: 0.1520, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-19 23:41:51] (step=0338800) Train Loss: 0.1517, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:42:00] (step=0338900) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:42:08] (step=0339000) Train Loss: 0.1514, Train Steps/Sec: 11.56, Grad Norm: 0.0064 +[2025-02-19 23:42:18] (step=0339100) Train Loss: 0.1503, Train Steps/Sec: 9.98, Grad Norm: 0.0066 +[2025-02-19 23:42:27] (step=0339200) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-19 23:42:36] (step=0339300) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-19 23:42:44] (step=0339400) Train Loss: 0.1498, Train Steps/Sec: 11.58, Grad Norm: 0.0065 +[2025-02-19 23:42:53] (step=0339500) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-19 23:43:02] (step=0339600) Train Loss: 0.1512, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 23:43:10] (step=0339700) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:43:19] (step=0339800) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:43:27] (step=0339900) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-19 23:43:37] (step=0340000) Train Loss: 0.1497, Train Steps/Sec: 10.58, Grad Norm: 0.0065 +[2025-02-19 23:43:38] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0340000.pt +[2025-02-19 23:43:47] (step=0340100) Train Loss: 0.1515, Train Steps/Sec: 10.04, Grad Norm: 0.0067 +[2025-02-19 23:43:56] (step=0340200) Train Loss: 0.1501, Train Steps/Sec: 11.23, Grad Norm: 0.0066 +[2025-02-19 23:44:06] (step=0340300) Train Loss: 0.1499, Train Steps/Sec: 9.96, Grad Norm: 0.0069 +[2025-02-19 23:44:15] (step=0340400) Train Loss: 0.1496, Train Steps/Sec: 10.89, Grad Norm: 0.0063 +[2025-02-19 23:44:24] (step=0340500) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 23:44:32] (step=0340600) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-19 23:44:41] (step=0340700) Train Loss: 0.1515, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 23:44:49] (step=0340800) Train Loss: 0.1516, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:44:58] (step=0340900) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-19 23:45:07] (step=0341000) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:45:15] (step=0341100) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:45:24] (step=0341200) Train Loss: 0.1521, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 23:45:32] (step=0341300) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-19 23:45:41] (step=0341400) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:45:50] (step=0341500) Train Loss: 0.1512, Train Steps/Sec: 11.56, Grad Norm: 0.0066 +[2025-02-19 23:46:00] (step=0341600) Train Loss: 0.1519, Train Steps/Sec: 9.99, Grad Norm: 0.0067 +[2025-02-19 23:46:08] (step=0341700) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:46:17] (step=0341800) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-19 23:46:26] (step=0341900) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 23:46:34] (step=0342000) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 23:46:43] (step=0342100) Train Loss: 0.1516, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:46:52] (step=0342200) Train Loss: 0.1498, Train Steps/Sec: 10.62, Grad Norm: 0.0065 +[2025-02-19 23:47:02] (step=0342300) Train Loss: 0.1508, Train Steps/Sec: 10.59, Grad Norm: 0.0064 +[2025-02-19 23:47:10] (step=0342400) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 23:47:19] (step=0342500) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:47:28] (step=0342600) Train Loss: 0.1495, Train Steps/Sec: 10.90, Grad Norm: 0.0066 +[2025-02-19 23:47:37] (step=0342700) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-19 23:47:47] (step=0342800) Train Loss: 0.1507, Train Steps/Sec: 9.96, Grad Norm: 0.0065 +[2025-02-19 23:47:55] (step=0342900) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:48:04] (step=0343000) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-19 23:48:13] (step=0343100) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:48:21] (step=0343200) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-19 23:48:30] (step=0343300) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-19 23:48:38] (step=0343400) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:48:47] (step=0343500) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-19 23:48:56] (step=0343600) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:49:04] (step=0343700) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:49:13] (step=0343800) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:49:21] (step=0343900) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:49:30] (step=0344000) Train Loss: 0.1513, Train Steps/Sec: 11.57, Grad Norm: 0.0067 +[2025-02-19 23:49:40] (step=0344100) Train Loss: 0.1496, Train Steps/Sec: 10.00, Grad Norm: 0.0065 +[2025-02-19 23:49:49] (step=0344200) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:49:57] (step=0344300) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 23:50:07] (step=0344400) Train Loss: 0.1501, Train Steps/Sec: 10.01, Grad Norm: 0.0065 +[2025-02-19 23:50:16] (step=0344500) Train Loss: 0.1512, Train Steps/Sec: 11.23, Grad Norm: 0.0065 +[2025-02-19 23:50:25] (step=0344600) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:50:34] (step=0344700) Train Loss: 0.1512, Train Steps/Sec: 11.26, Grad Norm: 0.0067 +[2025-02-19 23:50:43] (step=0344800) Train Loss: 0.1517, Train Steps/Sec: 11.21, Grad Norm: 0.0067 +[2025-02-19 23:50:51] (step=0344900) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-19 23:51:00] (step=0345000) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-19 23:51:08] (step=0345100) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:51:17] (step=0345200) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-19 23:51:27] (step=0345300) Train Loss: 0.1508, Train Steps/Sec: 10.01, Grad Norm: 0.0064 +[2025-02-19 23:51:36] (step=0345400) Train Loss: 0.1494, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-19 23:51:44] (step=0345500) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-19 23:51:53] (step=0345600) Train Loss: 0.1509, Train Steps/Sec: 11.59, Grad Norm: 0.0067 +[2025-02-19 23:52:02] (step=0345700) Train Loss: 0.1509, Train Steps/Sec: 11.59, Grad Norm: 0.0067 +[2025-02-19 23:52:10] (step=0345800) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 23:52:19] (step=0345900) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-19 23:52:27] (step=0346000) Train Loss: 0.1489, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-19 23:52:36] (step=0346100) Train Loss: 0.1509, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-19 23:52:45] (step=0346200) Train Loss: 0.1504, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-19 23:52:53] (step=0346300) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 23:53:02] (step=0346400) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 23:53:11] (step=0346500) Train Loss: 0.1500, Train Steps/Sec: 11.21, Grad Norm: 0.0068 +[2025-02-19 23:53:22] (step=0346600) Train Loss: 0.1518, Train Steps/Sec: 8.99, Grad Norm: 0.0064 +[2025-02-19 23:53:31] (step=0346700) Train Loss: 0.1503, Train Steps/Sec: 11.21, Grad Norm: 0.0066 +[2025-02-19 23:53:40] (step=0346800) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0066 +[2025-02-19 23:53:48] (step=0346900) Train Loss: 0.1508, Train Steps/Sec: 11.24, Grad Norm: 0.0064 +[2025-02-19 23:53:57] (step=0347000) Train Loss: 0.1519, Train Steps/Sec: 11.20, Grad Norm: 0.0068 +[2025-02-19 23:54:06] (step=0347100) Train Loss: 0.1518, Train Steps/Sec: 11.59, Grad Norm: 0.0066 +[2025-02-19 23:54:15] (step=0347200) Train Loss: 0.1515, Train Steps/Sec: 11.58, Grad Norm: 0.0065 +[2025-02-19 23:54:23] (step=0347300) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-19 23:54:32] (step=0347400) Train Loss: 0.1515, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:54:40] (step=0347500) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 23:54:49] (step=0347600) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:54:58] (step=0347700) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:55:08] (step=0347800) Train Loss: 0.1504, Train Steps/Sec: 9.92, Grad Norm: 0.0066 +[2025-02-19 23:55:16] (step=0347900) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-19 23:55:25] (step=0348000) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-19 23:55:34] (step=0348100) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-19 23:55:42] (step=0348200) Train Loss: 0.1518, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:55:51] (step=0348300) Train Loss: 0.1515, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-19 23:55:59] (step=0348400) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:56:08] (step=0348500) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-19 23:56:17] (step=0348600) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-19 23:56:26] (step=0348700) Train Loss: 0.1505, Train Steps/Sec: 10.57, Grad Norm: 0.0064 +[2025-02-19 23:56:35] (step=0348800) Train Loss: 0.1508, Train Steps/Sec: 10.89, Grad Norm: 0.0063 +[2025-02-19 23:56:44] (step=0348900) Train Loss: 0.1502, Train Steps/Sec: 11.24, Grad Norm: 0.0064 +[2025-02-19 23:56:53] (step=0349000) Train Loss: 0.1502, Train Steps/Sec: 11.56, Grad Norm: 0.0067 +[2025-02-19 23:57:03] (step=0349100) Train Loss: 0.1501, Train Steps/Sec: 9.44, Grad Norm: 0.0068 +[2025-02-19 23:57:12] (step=0349200) Train Loss: 0.1518, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-19 23:57:21] (step=0349300) Train Loss: 0.1494, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-19 23:57:29] (step=0349400) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-19 23:57:38] (step=0349500) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 23:57:47] (step=0349600) Train Loss: 0.1515, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 23:57:55] (step=0349700) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:58:04] (step=0349800) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0070 +[2025-02-19 23:58:12] (step=0349900) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-19 23:58:21] (step=0350000) Train Loss: 0.1517, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-19 23:58:30] (step=0350100) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-19 23:58:38] (step=0350200) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:58:48] (step=0350300) Train Loss: 0.1515, Train Steps/Sec: 9.94, Grad Norm: 0.0062 +[2025-02-19 23:58:57] (step=0350400) Train Loss: 0.1516, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:59:06] (step=0350500) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-19 23:59:14] (step=0350600) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:59:23] (step=0350700) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-19 23:59:31] (step=0350800) Train Loss: 0.1519, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-19 23:59:41] (step=0350900) Train Loss: 0.1526, Train Steps/Sec: 10.31, Grad Norm: 0.0068 +[2025-02-19 23:59:50] (step=0351000) Train Loss: 0.1513, Train Steps/Sec: 10.87, Grad Norm: 0.0067 +[2025-02-19 23:59:59] (step=0351100) Train Loss: 0.1517, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 00:00:08] (step=0351200) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 00:00:17] (step=0351300) Train Loss: 0.1515, Train Steps/Sec: 10.90, Grad Norm: 0.0066 +[2025-02-20 00:00:25] (step=0351400) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 00:00:34] (step=0351500) Train Loss: 0.1502, Train Steps/Sec: 11.56, Grad Norm: 0.0064 +[2025-02-20 00:00:44] (step=0351600) Train Loss: 0.1516, Train Steps/Sec: 10.00, Grad Norm: 0.0063 +[2025-02-20 00:00:53] (step=0351700) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 00:01:01] (step=0351800) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 00:01:10] (step=0351900) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-20 00:01:18] (step=0352000) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0068 +[2025-02-20 00:01:27] (step=0352100) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:01:36] (step=0352200) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 00:01:44] (step=0352300) Train Loss: 0.1522, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 00:01:53] (step=0352400) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:02:01] (step=0352500) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 00:02:10] (step=0352600) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 00:02:19] (step=0352700) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:02:29] (step=0352800) Train Loss: 0.1500, Train Steps/Sec: 10.02, Grad Norm: 0.0064 +[2025-02-20 00:02:37] (step=0352900) Train Loss: 0.1514, Train Steps/Sec: 11.57, Grad Norm: 0.0066 +[2025-02-20 00:02:46] (step=0353000) Train Loss: 0.1489, Train Steps/Sec: 11.57, Grad Norm: 0.0064 +[2025-02-20 00:02:56] (step=0353100) Train Loss: 0.1515, Train Steps/Sec: 9.96, Grad Norm: 0.0064 +[2025-02-20 00:03:05] (step=0353200) Train Loss: 0.1510, Train Steps/Sec: 11.18, Grad Norm: 0.0067 +[2025-02-20 00:03:14] (step=0353300) Train Loss: 0.1501, Train Steps/Sec: 11.52, Grad Norm: 0.0066 +[2025-02-20 00:03:22] (step=0353400) Train Loss: 0.1510, Train Steps/Sec: 11.21, Grad Norm: 0.0065 +[2025-02-20 00:03:31] (step=0353500) Train Loss: 0.1499, Train Steps/Sec: 11.19, Grad Norm: 0.0062 +[2025-02-20 00:03:40] (step=0353600) Train Loss: 0.1514, Train Steps/Sec: 11.56, Grad Norm: 0.0065 +[2025-02-20 00:03:49] (step=0353700) Train Loss: 0.1504, Train Steps/Sec: 11.55, Grad Norm: 0.0063 +[2025-02-20 00:03:57] (step=0353800) Train Loss: 0.1515, Train Steps/Sec: 11.57, Grad Norm: 0.0067 +[2025-02-20 00:04:06] (step=0353900) Train Loss: 0.1502, Train Steps/Sec: 11.55, Grad Norm: 0.0063 +[2025-02-20 00:04:15] (step=0354000) Train Loss: 0.1507, Train Steps/Sec: 11.57, Grad Norm: 0.0064 +[2025-02-20 00:04:25] (step=0354100) Train Loss: 0.1505, Train Steps/Sec: 9.97, Grad Norm: 0.0066 +[2025-02-20 00:04:33] (step=0354200) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 00:04:42] (step=0354300) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 00:04:51] (step=0354400) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 00:04:59] (step=0354500) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 00:05:08] (step=0354600) Train Loss: 0.1515, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-20 00:05:16] (step=0354700) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-20 00:05:25] (step=0354800) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:05:34] (step=0354900) Train Loss: 0.1522, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 00:05:42] (step=0355000) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 00:05:51] (step=0355100) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 00:06:00] (step=0355200) Train Loss: 0.1509, Train Steps/Sec: 10.94, Grad Norm: 0.0062 +[2025-02-20 00:06:11] (step=0355300) Train Loss: 0.1509, Train Steps/Sec: 9.14, Grad Norm: 0.0065 +[2025-02-20 00:06:20] (step=0355400) Train Loss: 0.1503, Train Steps/Sec: 11.22, Grad Norm: 0.0065 +[2025-02-20 00:06:28] (step=0355500) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 00:06:38] (step=0355600) Train Loss: 0.1507, Train Steps/Sec: 10.88, Grad Norm: 0.0065 +[2025-02-20 00:06:46] (step=0355700) Train Loss: 0.1521, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 00:06:55] (step=0355800) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:07:04] (step=0355900) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 00:07:12] (step=0356000) Train Loss: 0.1518, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 00:07:21] (step=0356100) Train Loss: 0.1500, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 00:07:29] (step=0356200) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 00:07:38] (step=0356300) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 00:07:47] (step=0356400) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 00:07:55] (step=0356500) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 00:08:05] (step=0356600) Train Loss: 0.1505, Train Steps/Sec: 9.90, Grad Norm: 0.0064 +[2025-02-20 00:08:14] (step=0356700) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 00:08:23] (step=0356800) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 00:08:31] (step=0356900) Train Loss: 0.1518, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-20 00:08:40] (step=0357000) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 00:08:48] (step=0357100) Train Loss: 0.1523, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-20 00:08:57] (step=0357200) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:09:06] (step=0357300) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 00:09:15] (step=0357400) Train Loss: 0.1513, Train Steps/Sec: 10.62, Grad Norm: 0.0063 +[2025-02-20 00:09:24] (step=0357500) Train Loss: 0.1501, Train Steps/Sec: 10.91, Grad Norm: 0.0065 +[2025-02-20 00:09:33] (step=0357600) Train Loss: 0.1514, Train Steps/Sec: 11.22, Grad Norm: 0.0065 +[2025-02-20 00:09:42] (step=0357700) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 00:09:52] (step=0357800) Train Loss: 0.1508, Train Steps/Sec: 9.44, Grad Norm: 0.0063 +[2025-02-20 00:10:01] (step=0357900) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 00:10:10] (step=0358000) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 00:10:18] (step=0358100) Train Loss: 0.1494, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 00:10:27] (step=0358200) Train Loss: 0.1518, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-20 00:10:35] (step=0358300) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 00:10:44] (step=0358400) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 00:10:53] (step=0358500) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0069 +[2025-02-20 00:11:01] (step=0358600) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 00:11:10] (step=0358700) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-20 00:11:19] (step=0358800) Train Loss: 0.1523, Train Steps/Sec: 11.60, Grad Norm: 0.0068 +[2025-02-20 00:11:27] (step=0358900) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 00:11:36] (step=0359000) Train Loss: 0.1517, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 00:11:46] (step=0359100) Train Loss: 0.1515, Train Steps/Sec: 9.96, Grad Norm: 0.0064 +[2025-02-20 00:11:55] (step=0359200) Train Loss: 0.1491, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 00:12:03] (step=0359300) Train Loss: 0.1526, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 00:12:12] (step=0359400) Train Loss: 0.1496, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 00:12:20] (step=0359500) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 00:12:30] (step=0359600) Train Loss: 0.1504, Train Steps/Sec: 10.30, Grad Norm: 0.0064 +[2025-02-20 00:12:39] (step=0359700) Train Loss: 0.1508, Train Steps/Sec: 10.88, Grad Norm: 0.0065 +[2025-02-20 00:12:48] (step=0359800) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 00:12:57] (step=0359900) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 00:13:06] (step=0360000) Train Loss: 0.1514, Train Steps/Sec: 10.89, Grad Norm: 0.0063 +[2025-02-20 00:13:07] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0360000.pt +[2025-02-20 00:13:15] (step=0360100) Train Loss: 0.1518, Train Steps/Sec: 10.24, Grad Norm: 0.0063 +[2025-02-20 00:13:24] (step=0360200) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 00:13:34] (step=0360300) Train Loss: 0.1509, Train Steps/Sec: 9.95, Grad Norm: 0.0067 +[2025-02-20 00:13:43] (step=0360400) Train Loss: 0.1515, Train Steps/Sec: 11.56, Grad Norm: 0.0067 +[2025-02-20 00:13:51] (step=0360500) Train Loss: 0.1508, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 00:14:00] (step=0360600) Train Loss: 0.1505, Train Steps/Sec: 11.57, Grad Norm: 0.0064 +[2025-02-20 00:14:09] (step=0360700) Train Loss: 0.1498, Train Steps/Sec: 11.58, Grad Norm: 0.0068 +[2025-02-20 00:14:17] (step=0360800) Train Loss: 0.1501, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 00:14:26] (step=0360900) Train Loss: 0.1500, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 00:14:35] (step=0361000) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-20 00:14:43] (step=0361100) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 00:14:52] (step=0361200) Train Loss: 0.1520, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-20 00:15:01] (step=0361300) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0069 +[2025-02-20 00:15:09] (step=0361400) Train Loss: 0.1493, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 00:15:18] (step=0361500) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0066 +[2025-02-20 00:15:28] (step=0361600) Train Loss: 0.1510, Train Steps/Sec: 10.01, Grad Norm: 0.0065 +[2025-02-20 00:15:37] (step=0361700) Train Loss: 0.1504, Train Steps/Sec: 11.27, Grad Norm: 0.0065 +[2025-02-20 00:15:46] (step=0361800) Train Loss: 0.1501, Train Steps/Sec: 10.28, Grad Norm: 0.0066 +[2025-02-20 00:15:55] (step=0361900) Train Loss: 0.1500, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-20 00:16:04] (step=0362000) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 00:16:13] (step=0362100) Train Loss: 0.1519, Train Steps/Sec: 11.30, Grad Norm: 0.0067 +[2025-02-20 00:16:22] (step=0362200) Train Loss: 0.1494, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-20 00:16:30] (step=0362300) Train Loss: 0.1518, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-20 00:16:39] (step=0362400) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 00:16:47] (step=0362500) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:16:56] (step=0362600) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 00:17:05] (step=0362700) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:17:15] (step=0362800) Train Loss: 0.1508, Train Steps/Sec: 9.96, Grad Norm: 0.0064 +[2025-02-20 00:17:23] (step=0362900) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 00:17:32] (step=0363000) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:17:40] (step=0363100) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 00:17:49] (step=0363200) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-20 00:17:58] (step=0363300) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 00:18:06] (step=0363400) Train Loss: 0.1516, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 00:18:15] (step=0363500) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 00:18:24] (step=0363600) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-20 00:18:32] (step=0363700) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:18:41] (step=0363800) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 00:18:50] (step=0363900) Train Loss: 0.1502, Train Steps/Sec: 10.59, Grad Norm: 0.0066 +[2025-02-20 00:18:59] (step=0364000) Train Loss: 0.1498, Train Steps/Sec: 10.93, Grad Norm: 0.0064 +[2025-02-20 00:19:10] (step=0364100) Train Loss: 0.1509, Train Steps/Sec: 9.72, Grad Norm: 0.0066 +[2025-02-20 00:19:18] (step=0364200) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 00:19:27] (step=0364300) Train Loss: 0.1511, Train Steps/Sec: 10.90, Grad Norm: 0.0064 +[2025-02-20 00:19:36] (step=0364400) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 00:19:45] (step=0364500) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 00:19:53] (step=0364600) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:20:02] (step=0364700) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:20:10] (step=0364800) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 00:20:19] (step=0364900) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 00:20:28] (step=0365000) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:20:36] (step=0365100) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:20:45] (step=0365200) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:20:55] (step=0365300) Train Loss: 0.1512, Train Steps/Sec: 9.97, Grad Norm: 0.0064 +[2025-02-20 00:21:03] (step=0365400) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 00:21:12] (step=0365500) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 00:21:21] (step=0365600) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:21:29] (step=0365700) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 00:21:38] (step=0365800) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:21:46] (step=0365900) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-20 00:21:55] (step=0366000) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 00:22:05] (step=0366100) Train Loss: 0.1506, Train Steps/Sec: 10.61, Grad Norm: 0.0065 +[2025-02-20 00:22:14] (step=0366200) Train Loss: 0.1497, Train Steps/Sec: 10.58, Grad Norm: 0.0065 +[2025-02-20 00:22:23] (step=0366300) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 00:22:31] (step=0366400) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 00:22:40] (step=0366500) Train Loss: 0.1499, Train Steps/Sec: 10.90, Grad Norm: 0.0063 +[2025-02-20 00:22:50] (step=0366600) Train Loss: 0.1503, Train Steps/Sec: 9.98, Grad Norm: 0.0066 +[2025-02-20 00:22:59] (step=0366700) Train Loss: 0.1514, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 00:23:08] (step=0366800) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 00:23:16] (step=0366900) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:23:25] (step=0367000) Train Loss: 0.1520, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:23:33] (step=0367100) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 00:23:42] (step=0367200) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:23:51] (step=0367300) Train Loss: 0.1513, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:23:59] (step=0367400) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 00:24:08] (step=0367500) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:24:16] (step=0367600) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:24:25] (step=0367700) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 00:24:35] (step=0367800) Train Loss: 0.1501, Train Steps/Sec: 9.96, Grad Norm: 0.0062 +[2025-02-20 00:24:44] (step=0367900) Train Loss: 0.1516, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-20 00:24:52] (step=0368000) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:25:01] (step=0368100) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 00:25:09] (step=0368200) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 00:25:19] (step=0368300) Train Loss: 0.1513, Train Steps/Sec: 10.34, Grad Norm: 0.0064 +[2025-02-20 00:25:28] (step=0368400) Train Loss: 0.1506, Train Steps/Sec: 10.90, Grad Norm: 0.0063 +[2025-02-20 00:25:37] (step=0368500) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 00:25:46] (step=0368600) Train Loss: 0.1515, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-20 00:25:55] (step=0368700) Train Loss: 0.1503, Train Steps/Sec: 11.27, Grad Norm: 0.0066 +[2025-02-20 00:26:03] (step=0368800) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 00:26:12] (step=0368900) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-20 00:26:21] (step=0369000) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-20 00:26:30] (step=0369100) Train Loss: 0.1514, Train Steps/Sec: 10.04, Grad Norm: 0.0066 +[2025-02-20 00:26:39] (step=0369200) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-20 00:26:48] (step=0369300) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:26:56] (step=0369400) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-20 00:27:05] (step=0369500) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 00:27:13] (step=0369600) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:27:22] (step=0369700) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:27:31] (step=0369800) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:27:39] (step=0369900) Train Loss: 0.1513, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:27:48] (step=0370000) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:27:56] (step=0370100) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 00:28:05] (step=0370200) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 00:28:15] (step=0370300) Train Loss: 0.1511, Train Steps/Sec: 10.02, Grad Norm: 0.0064 +[2025-02-20 00:28:24] (step=0370400) Train Loss: 0.1500, Train Steps/Sec: 10.90, Grad Norm: 0.0065 +[2025-02-20 00:28:34] (step=0370500) Train Loss: 0.1498, Train Steps/Sec: 10.56, Grad Norm: 0.0064 +[2025-02-20 00:28:43] (step=0370600) Train Loss: 0.1512, Train Steps/Sec: 11.19, Grad Norm: 0.0063 +[2025-02-20 00:28:51] (step=0370700) Train Loss: 0.1506, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 00:29:01] (step=0370800) Train Loss: 0.1498, Train Steps/Sec: 10.87, Grad Norm: 0.0066 +[2025-02-20 00:29:09] (step=0370900) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 00:29:18] (step=0371000) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 00:29:26] (step=0371100) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 00:29:35] (step=0371200) Train Loss: 0.1509, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 00:29:44] (step=0371300) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 00:29:52] (step=0371400) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 00:30:01] (step=0371500) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 00:30:11] (step=0371600) Train Loss: 0.1517, Train Steps/Sec: 9.92, Grad Norm: 0.0064 +[2025-02-20 00:30:20] (step=0371700) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-20 00:30:28] (step=0371800) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 00:30:37] (step=0371900) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 00:30:45] (step=0372000) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 00:30:54] (step=0372100) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 00:31:03] (step=0372200) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 00:31:11] (step=0372300) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 00:31:20] (step=0372400) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 00:31:28] (step=0372500) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:31:38] (step=0372600) Train Loss: 0.1507, Train Steps/Sec: 10.58, Grad Norm: 0.0064 +[2025-02-20 00:31:47] (step=0372700) Train Loss: 0.1507, Train Steps/Sec: 10.91, Grad Norm: 0.0064 +[2025-02-20 00:31:57] (step=0372800) Train Loss: 0.1504, Train Steps/Sec: 9.68, Grad Norm: 0.0062 +[2025-02-20 00:32:06] (step=0372900) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:32:15] (step=0373000) Train Loss: 0.1505, Train Steps/Sec: 10.91, Grad Norm: 0.0064 +[2025-02-20 00:32:24] (step=0373100) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:32:32] (step=0373200) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 00:32:41] (step=0373300) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 00:32:50] (step=0373400) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:32:58] (step=0373500) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 00:33:07] (step=0373600) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 00:33:15] (step=0373700) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-20 00:33:24] (step=0373800) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:33:33] (step=0373900) Train Loss: 0.1518, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 00:33:41] (step=0374000) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 00:33:51] (step=0374100) Train Loss: 0.1509, Train Steps/Sec: 10.03, Grad Norm: 0.0064 +[2025-02-20 00:34:00] (step=0374200) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 00:34:08] (step=0374300) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 00:34:17] (step=0374400) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 00:34:26] (step=0374500) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 00:34:34] (step=0374600) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 00:34:43] (step=0374700) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:34:53] (step=0374800) Train Loss: 0.1503, Train Steps/Sec: 10.31, Grad Norm: 0.0062 +[2025-02-20 00:35:02] (step=0374900) Train Loss: 0.1493, Train Steps/Sec: 10.88, Grad Norm: 0.0065 +[2025-02-20 00:35:10] (step=0375000) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 00:35:19] (step=0375100) Train Loss: 0.1496, Train Steps/Sec: 11.24, Grad Norm: 0.0063 +[2025-02-20 00:35:28] (step=0375200) Train Loss: 0.1505, Train Steps/Sec: 11.27, Grad Norm: 0.0065 +[2025-02-20 00:35:37] (step=0375300) Train Loss: 0.1511, Train Steps/Sec: 10.85, Grad Norm: 0.0066 +[2025-02-20 00:35:47] (step=0375400) Train Loss: 0.1513, Train Steps/Sec: 10.61, Grad Norm: 0.0063 +[2025-02-20 00:35:55] (step=0375500) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 00:36:04] (step=0375600) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 00:36:13] (step=0375700) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 00:36:21] (step=0375800) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 00:36:30] (step=0375900) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 00:36:38] (step=0376000) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 00:36:47] (step=0376100) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-20 00:36:56] (step=0376200) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 00:37:04] (step=0376300) Train Loss: 0.1512, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 00:37:13] (step=0376400) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 00:37:22] (step=0376500) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 00:37:32] (step=0376600) Train Loss: 0.1513, Train Steps/Sec: 9.93, Grad Norm: 0.0065 +[2025-02-20 00:37:40] (step=0376700) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 00:37:49] (step=0376800) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 00:37:57] (step=0376900) Train Loss: 0.1511, Train Steps/Sec: 11.58, Grad Norm: 0.0067 +[2025-02-20 00:38:07] (step=0377000) Train Loss: 0.1508, Train Steps/Sec: 10.31, Grad Norm: 0.0066 +[2025-02-20 00:38:16] (step=0377100) Train Loss: 0.1510, Train Steps/Sec: 10.88, Grad Norm: 0.0067 +[2025-02-20 00:38:25] (step=0377200) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 00:38:34] (step=0377300) Train Loss: 0.1510, Train Steps/Sec: 11.23, Grad Norm: 0.0065 +[2025-02-20 00:38:43] (step=0377400) Train Loss: 0.1503, Train Steps/Sec: 11.26, Grad Norm: 0.0065 +[2025-02-20 00:38:51] (step=0377500) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:39:00] (step=0377600) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:39:09] (step=0377700) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 00:39:17] (step=0377800) Train Loss: 0.1503, Train Steps/Sec: 11.57, Grad Norm: 0.0066 +[2025-02-20 00:39:27] (step=0377900) Train Loss: 0.1503, Train Steps/Sec: 9.91, Grad Norm: 0.0062 +[2025-02-20 00:39:36] (step=0378000) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 00:39:45] (step=0378100) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 00:39:53] (step=0378200) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 00:40:02] (step=0378300) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 00:40:10] (step=0378400) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-20 00:40:19] (step=0378500) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-20 00:40:28] (step=0378600) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 00:40:36] (step=0378700) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 00:40:45] (step=0378800) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 00:40:54] (step=0378900) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 00:41:02] (step=0379000) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 00:41:13] (step=0379100) Train Loss: 0.1498, Train Steps/Sec: 9.26, Grad Norm: 0.0064 +[2025-02-20 00:41:22] (step=0379200) Train Loss: 0.1495, Train Steps/Sec: 10.88, Grad Norm: 0.0063 +[2025-02-20 00:41:31] (step=0379300) Train Loss: 0.1507, Train Steps/Sec: 11.21, Grad Norm: 0.0068 +[2025-02-20 00:41:40] (step=0379400) Train Loss: 0.1504, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 00:41:49] (step=0379500) Train Loss: 0.1504, Train Steps/Sec: 10.88, Grad Norm: 0.0063 +[2025-02-20 00:41:58] (step=0379600) Train Loss: 0.1500, Train Steps/Sec: 11.58, Grad Norm: 0.0065 +[2025-02-20 00:42:06] (step=0379700) Train Loss: 0.1500, Train Steps/Sec: 11.58, Grad Norm: 0.0066 +[2025-02-20 00:42:15] (step=0379800) Train Loss: 0.1497, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 00:42:24] (step=0379900) Train Loss: 0.1509, Train Steps/Sec: 11.55, Grad Norm: 0.0064 +[2025-02-20 00:42:32] (step=0380000) Train Loss: 0.1502, Train Steps/Sec: 11.54, Grad Norm: 0.0061 +[2025-02-20 00:42:33] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0380000.pt +[2025-02-20 00:42:42] (step=0380100) Train Loss: 0.1509, Train Steps/Sec: 10.10, Grad Norm: 0.0064 +[2025-02-20 00:42:51] (step=0380200) Train Loss: 0.1498, Train Steps/Sec: 11.56, Grad Norm: 0.0066 +[2025-02-20 00:42:59] (step=0380300) Train Loss: 0.1512, Train Steps/Sec: 11.55, Grad Norm: 0.0063 +[2025-02-20 00:43:09] (step=0380400) Train Loss: 0.1508, Train Steps/Sec: 10.02, Grad Norm: 0.0065 +[2025-02-20 00:43:18] (step=0380500) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 00:43:27] (step=0380600) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:43:35] (step=0380700) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:43:44] (step=0380800) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 00:43:52] (step=0380900) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-20 00:44:01] (step=0381000) Train Loss: 0.1529, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-20 00:44:10] (step=0381100) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:44:18] (step=0381200) Train Loss: 0.1514, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 00:44:28] (step=0381300) Train Loss: 0.1511, Train Steps/Sec: 10.61, Grad Norm: 0.0063 +[2025-02-20 00:44:37] (step=0381400) Train Loss: 0.1490, Train Steps/Sec: 10.58, Grad Norm: 0.0064 +[2025-02-20 00:44:46] (step=0381500) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:44:56] (step=0381600) Train Loss: 0.1506, Train Steps/Sec: 9.73, Grad Norm: 0.0062 +[2025-02-20 00:45:05] (step=0381700) Train Loss: 0.1495, Train Steps/Sec: 11.26, Grad Norm: 0.0064 +[2025-02-20 00:45:13] (step=0381800) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:45:22] (step=0381900) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:45:31] (step=0382000) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:45:39] (step=0382100) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 00:45:48] (step=0382200) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 00:45:56] (step=0382300) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 00:46:05] (step=0382400) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 00:46:14] (step=0382500) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 00:46:22] (step=0382600) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 00:46:31] (step=0382700) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:46:39] (step=0382800) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 00:46:50] (step=0382900) Train Loss: 0.1510, Train Steps/Sec: 9.93, Grad Norm: 0.0063 +[2025-02-20 00:46:58] (step=0383000) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 00:47:07] (step=0383100) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:47:15] (step=0383200) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 00:47:24] (step=0383300) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0065 +[2025-02-20 00:47:33] (step=0383400) Train Loss: 0.1511, Train Steps/Sec: 11.66, Grad Norm: 0.0063 +[2025-02-20 00:47:42] (step=0383500) Train Loss: 0.1513, Train Steps/Sec: 10.36, Grad Norm: 0.0063 +[2025-02-20 00:47:51] (step=0383600) Train Loss: 0.1507, Train Steps/Sec: 10.89, Grad Norm: 0.0063 +[2025-02-20 00:48:00] (step=0383700) Train Loss: 0.1517, Train Steps/Sec: 11.49, Grad Norm: 0.0064 +[2025-02-20 00:48:09] (step=0383800) Train Loss: 0.1496, Train Steps/Sec: 11.07, Grad Norm: 0.0061 +[2025-02-20 00:48:18] (step=0383900) Train Loss: 0.1513, Train Steps/Sec: 11.12, Grad Norm: 0.0063 +[2025-02-20 00:48:27] (step=0384000) Train Loss: 0.1511, Train Steps/Sec: 11.46, Grad Norm: 0.0064 +[2025-02-20 00:48:37] (step=0384100) Train Loss: 0.1504, Train Steps/Sec: 9.86, Grad Norm: 0.0066 +[2025-02-20 00:48:46] (step=0384200) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 00:48:54] (step=0384300) Train Loss: 0.1498, Train Steps/Sec: 11.56, Grad Norm: 0.0065 +[2025-02-20 00:49:03] (step=0384400) Train Loss: 0.1503, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 00:49:12] (step=0384500) Train Loss: 0.1490, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 00:49:20] (step=0384600) Train Loss: 0.1511, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 00:49:29] (step=0384700) Train Loss: 0.1494, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 00:49:37] (step=0384800) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0066 +[2025-02-20 00:49:46] (step=0384900) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 00:49:55] (step=0385000) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 00:50:03] (step=0385100) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 00:50:12] (step=0385200) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 00:50:20] (step=0385300) Train Loss: 0.1510, Train Steps/Sec: 11.58, Grad Norm: 0.0065 +[2025-02-20 00:50:30] (step=0385400) Train Loss: 0.1512, Train Steps/Sec: 10.06, Grad Norm: 0.0066 +[2025-02-20 00:50:39] (step=0385500) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 00:50:48] (step=0385600) Train Loss: 0.1517, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 00:50:58] (step=0385700) Train Loss: 0.1524, Train Steps/Sec: 10.02, Grad Norm: 0.0065 +[2025-02-20 00:51:06] (step=0385800) Train Loss: 0.1493, Train Steps/Sec: 11.25, Grad Norm: 0.0066 +[2025-02-20 00:51:15] (step=0385900) Train Loss: 0.1501, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 00:51:24] (step=0386000) Train Loss: 0.1509, Train Steps/Sec: 11.24, Grad Norm: 0.0066 +[2025-02-20 00:51:33] (step=0386100) Train Loss: 0.1505, Train Steps/Sec: 11.28, Grad Norm: 0.0063 +[2025-02-20 00:51:41] (step=0386200) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:51:50] (step=0386300) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-20 00:51:59] (step=0386400) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 00:52:07] (step=0386500) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0065 +[2025-02-20 00:52:17] (step=0386600) Train Loss: 0.1492, Train Steps/Sec: 9.99, Grad Norm: 0.0065 +[2025-02-20 00:52:26] (step=0386700) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:52:34] (step=0386800) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 00:52:43] (step=0386900) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:52:52] (step=0387000) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:53:00] (step=0387100) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:53:09] (step=0387200) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:53:17] (step=0387300) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 00:53:26] (step=0387400) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 00:53:35] (step=0387500) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 00:53:43] (step=0387600) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 00:53:52] (step=0387700) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:54:01] (step=0387800) Train Loss: 0.1493, Train Steps/Sec: 10.91, Grad Norm: 0.0062 +[2025-02-20 00:54:12] (step=0387900) Train Loss: 0.1503, Train Steps/Sec: 9.05, Grad Norm: 0.0064 +[2025-02-20 00:54:21] (step=0388000) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 00:54:29] (step=0388100) Train Loss: 0.1523, Train Steps/Sec: 11.24, Grad Norm: 0.0064 +[2025-02-20 00:54:38] (step=0388200) Train Loss: 0.1495, Train Steps/Sec: 11.29, Grad Norm: 0.0062 +[2025-02-20 00:54:47] (step=0388300) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 00:54:56] (step=0388400) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:55:04] (step=0388500) Train Loss: 0.1514, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 00:55:13] (step=0388600) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 00:55:21] (step=0388700) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 00:55:30] (step=0388800) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:55:39] (step=0388900) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 00:55:47] (step=0389000) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 00:55:57] (step=0389100) Train Loss: 0.1500, Train Steps/Sec: 9.90, Grad Norm: 0.0064 +[2025-02-20 00:56:06] (step=0389200) Train Loss: 0.1522, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 00:56:14] (step=0389300) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 00:56:23] (step=0389400) Train Loss: 0.1487, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 00:56:32] (step=0389500) Train Loss: 0.1521, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 00:56:40] (step=0389600) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-20 00:56:49] (step=0389700) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:56:57] (step=0389800) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:57:06] (step=0389900) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 00:57:15] (step=0390000) Train Loss: 0.1499, Train Steps/Sec: 10.61, Grad Norm: 0.0062 +[2025-02-20 00:57:25] (step=0390100) Train Loss: 0.1509, Train Steps/Sec: 10.57, Grad Norm: 0.0067 +[2025-02-20 00:57:34] (step=0390200) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 00:57:42] (step=0390300) Train Loss: 0.1500, Train Steps/Sec: 11.20, Grad Norm: 0.0063 +[2025-02-20 00:57:53] (step=0390400) Train Loss: 0.1520, Train Steps/Sec: 9.79, Grad Norm: 0.0063 +[2025-02-20 00:58:01] (step=0390500) Train Loss: 0.1511, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 00:58:10] (step=0390600) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 00:58:18] (step=0390700) Train Loss: 0.1507, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 00:58:27] (step=0390800) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0065 +[2025-02-20 00:58:36] (step=0390900) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 00:58:44] (step=0391000) Train Loss: 0.1485, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 00:58:53] (step=0391100) Train Loss: 0.1524, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 00:59:01] (step=0391200) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:59:10] (step=0391300) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 00:59:19] (step=0391400) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 00:59:27] (step=0391500) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 00:59:37] (step=0391600) Train Loss: 0.1490, Train Steps/Sec: 10.02, Grad Norm: 0.0062 +[2025-02-20 00:59:46] (step=0391700) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 00:59:54] (step=0391800) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 01:00:03] (step=0391900) Train Loss: 0.1486, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 01:00:12] (step=0392000) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 01:00:20] (step=0392100) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 01:00:30] (step=0392200) Train Loss: 0.1504, Train Steps/Sec: 10.32, Grad Norm: 0.0062 +[2025-02-20 01:00:39] (step=0392300) Train Loss: 0.1497, Train Steps/Sec: 10.89, Grad Norm: 0.0063 +[2025-02-20 01:00:48] (step=0392400) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 01:00:57] (step=0392500) Train Loss: 0.1502, Train Steps/Sec: 11.24, Grad Norm: 0.0064 +[2025-02-20 01:01:05] (step=0392600) Train Loss: 0.1504, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 01:01:14] (step=0392700) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 01:01:23] (step=0392800) Train Loss: 0.1494, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 01:01:33] (step=0392900) Train Loss: 0.1500, Train Steps/Sec: 9.86, Grad Norm: 0.0063 +[2025-02-20 01:01:41] (step=0393000) Train Loss: 0.1504, Train Steps/Sec: 11.57, Grad Norm: 0.0064 +[2025-02-20 01:01:50] (step=0393100) Train Loss: 0.1518, Train Steps/Sec: 11.56, Grad Norm: 0.0064 +[2025-02-20 01:01:59] (step=0393200) Train Loss: 0.1504, Train Steps/Sec: 11.56, Grad Norm: 0.0065 +[2025-02-20 01:02:07] (step=0393300) Train Loss: 0.1512, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 01:02:16] (step=0393400) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-20 01:02:25] (step=0393500) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 01:02:33] (step=0393600) Train Loss: 0.1494, Train Steps/Sec: 11.59, Grad Norm: 0.0067 +[2025-02-20 01:02:42] (step=0393700) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-20 01:02:51] (step=0393800) Train Loss: 0.1489, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 01:02:59] (step=0393900) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 01:03:08] (step=0394000) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 01:03:18] (step=0394100) Train Loss: 0.1505, Train Steps/Sec: 10.05, Grad Norm: 0.0064 +[2025-02-20 01:03:26] (step=0394200) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 01:03:36] (step=0394300) Train Loss: 0.1495, Train Steps/Sec: 10.86, Grad Norm: 0.0063 +[2025-02-20 01:03:45] (step=0394400) Train Loss: 0.1512, Train Steps/Sec: 10.62, Grad Norm: 0.0064 +[2025-02-20 01:03:54] (step=0394500) Train Loss: 0.1506, Train Steps/Sec: 11.26, Grad Norm: 0.0063 +[2025-02-20 01:04:03] (step=0394600) Train Loss: 0.1508, Train Steps/Sec: 11.25, Grad Norm: 0.0064 +[2025-02-20 01:04:11] (step=0394700) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 01:04:20] (step=0394800) Train Loss: 0.1503, Train Steps/Sec: 11.30, Grad Norm: 0.0064 +[2025-02-20 01:04:29] (step=0394900) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 01:04:37] (step=0395000) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 01:04:46] (step=0395100) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-20 01:04:55] (step=0395200) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 01:05:03] (step=0395300) Train Loss: 0.1500, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 01:05:13] (step=0395400) Train Loss: 0.1510, Train Steps/Sec: 10.04, Grad Norm: 0.0062 +[2025-02-20 01:05:22] (step=0395500) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 01:05:30] (step=0395600) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 01:05:39] (step=0395700) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 01:05:48] (step=0395800) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-20 01:05:56] (step=0395900) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 01:06:05] (step=0396000) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 01:06:13] (step=0396100) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-20 01:06:22] (step=0396200) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 01:06:31] (step=0396300) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 01:06:39] (step=0396400) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 01:06:49] (step=0396500) Train Loss: 0.1506, Train Steps/Sec: 10.62, Grad Norm: 0.0064 +[2025-02-20 01:06:59] (step=0396600) Train Loss: 0.1506, Train Steps/Sec: 9.22, Grad Norm: 0.0064 +[2025-02-20 01:07:08] (step=0396700) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 01:07:17] (step=0396800) Train Loss: 0.1509, Train Steps/Sec: 11.26, Grad Norm: 0.0065 +[2025-02-20 01:07:26] (step=0396900) Train Loss: 0.1509, Train Steps/Sec: 11.28, Grad Norm: 0.0065 +[2025-02-20 01:07:34] (step=0397000) Train Loss: 0.1516, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 01:07:43] (step=0397100) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 01:07:52] (step=0397200) Train Loss: 0.1501, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:08:00] (step=0397300) Train Loss: 0.1506, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 01:08:09] (step=0397400) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 01:08:17] (step=0397500) Train Loss: 0.1491, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:08:26] (step=0397600) Train Loss: 0.1493, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 01:08:34] (step=0397700) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:08:43] (step=0397800) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 01:08:53] (step=0397900) Train Loss: 0.1494, Train Steps/Sec: 9.94, Grad Norm: 0.0064 +[2025-02-20 01:09:02] (step=0398000) Train Loss: 0.1494, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 01:09:10] (step=0398100) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 01:09:19] (step=0398200) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-20 01:09:28] (step=0398300) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 01:09:36] (step=0398400) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 01:09:45] (step=0398500) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 01:09:53] (step=0398600) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 01:10:03] (step=0398700) Train Loss: 0.1499, Train Steps/Sec: 10.62, Grad Norm: 0.0063 +[2025-02-20 01:10:12] (step=0398800) Train Loss: 0.1502, Train Steps/Sec: 10.60, Grad Norm: 0.0065 +[2025-02-20 01:10:21] (step=0398900) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-20 01:10:30] (step=0399000) Train Loss: 0.1513, Train Steps/Sec: 11.24, Grad Norm: 0.0063 +[2025-02-20 01:10:40] (step=0399100) Train Loss: 0.1510, Train Steps/Sec: 9.71, Grad Norm: 0.0065 +[2025-02-20 01:10:49] (step=0399200) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 01:10:57] (step=0399300) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 01:11:06] (step=0399400) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 01:11:14] (step=0399500) Train Loss: 0.1501, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:11:23] (step=0399600) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 01:11:32] (step=0399700) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:11:40] (step=0399800) Train Loss: 0.1501, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 01:11:49] (step=0399900) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 01:11:57] (step=0400000) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-20 01:11:59] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0400000.pt +[2025-02-20 01:12:07] (step=0400100) Train Loss: 0.1499, Train Steps/Sec: 10.12, Grad Norm: 0.0063 +[2025-02-20 01:12:16] (step=0400200) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 01:12:25] (step=0400300) Train Loss: 0.1513, Train Steps/Sec: 11.55, Grad Norm: 0.0064 +[2025-02-20 01:12:34] (step=0400400) Train Loss: 0.1501, Train Steps/Sec: 10.06, Grad Norm: 0.0064 +[2025-02-20 01:12:43] (step=0400500) Train Loss: 0.1516, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 01:12:52] (step=0400600) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 01:13:00] (step=0400700) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 01:13:09] (step=0400800) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 01:13:19] (step=0400900) Train Loss: 0.1504, Train Steps/Sec: 10.31, Grad Norm: 0.0062 +[2025-02-20 01:13:28] (step=0401000) Train Loss: 0.1501, Train Steps/Sec: 10.76, Grad Norm: 0.0063 +[2025-02-20 01:13:37] (step=0401100) Train Loss: 0.1497, Train Steps/Sec: 11.13, Grad Norm: 0.0063 +[2025-02-20 01:13:46] (step=0401200) Train Loss: 0.1508, Train Steps/Sec: 11.47, Grad Norm: 0.0064 +[2025-02-20 01:13:55] (step=0401300) Train Loss: 0.1500, Train Steps/Sec: 11.14, Grad Norm: 0.0064 +[2025-02-20 01:14:03] (step=0401400) Train Loss: 0.1503, Train Steps/Sec: 11.49, Grad Norm: 0.0067 +[2025-02-20 01:14:12] (step=0401500) Train Loss: 0.1520, Train Steps/Sec: 11.48, Grad Norm: 0.0063 +[2025-02-20 01:14:22] (step=0401600) Train Loss: 0.1510, Train Steps/Sec: 9.91, Grad Norm: 0.0065 +[2025-02-20 01:14:31] (step=0401700) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 01:14:39] (step=0401800) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:14:48] (step=0401900) Train Loss: 0.1514, Train Steps/Sec: 11.65, Grad Norm: 0.0066 +[2025-02-20 01:14:56] (step=0402000) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 01:15:05] (step=0402100) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 01:15:14] (step=0402200) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:15:22] (step=0402300) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 01:15:31] (step=0402400) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 01:15:39] (step=0402500) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 01:15:48] (step=0402600) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 01:15:57] (step=0402700) Train Loss: 0.1492, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:16:05] (step=0402800) Train Loss: 0.1506, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 01:16:15] (step=0402900) Train Loss: 0.1508, Train Steps/Sec: 10.04, Grad Norm: 0.0063 +[2025-02-20 01:16:25] (step=0403000) Train Loss: 0.1502, Train Steps/Sec: 10.54, Grad Norm: 0.0062 +[2025-02-20 01:16:34] (step=0403100) Train Loss: 0.1512, Train Steps/Sec: 11.27, Grad Norm: 0.0063 +[2025-02-20 01:16:43] (step=0403200) Train Loss: 0.1523, Train Steps/Sec: 10.88, Grad Norm: 0.0064 +[2025-02-20 01:16:52] (step=0403300) Train Loss: 0.1508, Train Steps/Sec: 11.27, Grad Norm: 0.0066 +[2025-02-20 01:17:00] (step=0403400) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 01:17:09] (step=0403500) Train Loss: 0.1500, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 01:17:18] (step=0403600) Train Loss: 0.1514, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 01:17:26] (step=0403700) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 01:17:35] (step=0403800) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 01:17:44] (step=0403900) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-20 01:17:52] (step=0404000) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 01:18:02] (step=0404100) Train Loss: 0.1498, Train Steps/Sec: 10.01, Grad Norm: 0.0061 +[2025-02-20 01:18:11] (step=0404200) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 01:18:19] (step=0404300) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 01:18:28] (step=0404400) Train Loss: 0.1507, Train Steps/Sec: 11.57, Grad Norm: 0.0064 +[2025-02-20 01:18:37] (step=0404500) Train Loss: 0.1500, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 01:18:45] (step=0404600) Train Loss: 0.1502, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 01:18:54] (step=0404700) Train Loss: 0.1513, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 01:19:03] (step=0404800) Train Loss: 0.1510, Train Steps/Sec: 11.56, Grad Norm: 0.0065 +[2025-02-20 01:19:11] (step=0404900) Train Loss: 0.1510, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 01:19:20] (step=0405000) Train Loss: 0.1496, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 01:19:29] (step=0405100) Train Loss: 0.1500, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 01:19:38] (step=0405200) Train Loss: 0.1515, Train Steps/Sec: 10.53, Grad Norm: 0.0064 +[2025-02-20 01:19:48] (step=0405300) Train Loss: 0.1503, Train Steps/Sec: 10.51, Grad Norm: 0.0065 +[2025-02-20 01:19:58] (step=0405400) Train Loss: 0.1506, Train Steps/Sec: 9.72, Grad Norm: 0.0064 +[2025-02-20 01:20:07] (step=0405500) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:20:15] (step=0405600) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 01:20:24] (step=0405700) Train Loss: 0.1514, Train Steps/Sec: 11.26, Grad Norm: 0.0064 +[2025-02-20 01:20:33] (step=0405800) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 01:20:41] (step=0405900) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:20:50] (step=0406000) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 01:20:58] (step=0406100) Train Loss: 0.1497, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 01:21:07] (step=0406200) Train Loss: 0.1506, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 01:21:16] (step=0406300) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-20 01:21:24] (step=0406400) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 01:21:33] (step=0406500) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 01:21:43] (step=0406600) Train Loss: 0.1498, Train Steps/Sec: 10.01, Grad Norm: 0.0061 +[2025-02-20 01:21:51] (step=0406700) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:22:00] (step=0406800) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:22:09] (step=0406900) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 01:22:17] (step=0407000) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 01:22:26] (step=0407100) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 01:22:34] (step=0407200) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 01:22:43] (step=0407300) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 01:22:53] (step=0407400) Train Loss: 0.1499, Train Steps/Sec: 10.35, Grad Norm: 0.0066 +[2025-02-20 01:23:02] (step=0407500) Train Loss: 0.1498, Train Steps/Sec: 10.89, Grad Norm: 0.0064 +[2025-02-20 01:23:11] (step=0407600) Train Loss: 0.1488, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-20 01:23:19] (step=0407700) Train Loss: 0.1497, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 01:23:28] (step=0407800) Train Loss: 0.1507, Train Steps/Sec: 11.26, Grad Norm: 0.0064 +[2025-02-20 01:23:38] (step=0407900) Train Loss: 0.1512, Train Steps/Sec: 9.97, Grad Norm: 0.0064 +[2025-02-20 01:23:47] (step=0408000) Train Loss: 0.1497, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 01:23:55] (step=0408100) Train Loss: 0.1515, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:24:04] (step=0408200) Train Loss: 0.1510, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 01:24:13] (step=0408300) Train Loss: 0.1517, Train Steps/Sec: 11.50, Grad Norm: 0.0063 +[2025-02-20 01:24:21] (step=0408400) Train Loss: 0.1512, Train Steps/Sec: 11.50, Grad Norm: 0.0064 +[2025-02-20 01:24:30] (step=0408500) Train Loss: 0.1512, Train Steps/Sec: 11.49, Grad Norm: 0.0063 +[2025-02-20 01:24:39] (step=0408600) Train Loss: 0.1513, Train Steps/Sec: 11.48, Grad Norm: 0.0063 +[2025-02-20 01:24:47] (step=0408700) Train Loss: 0.1493, Train Steps/Sec: 11.48, Grad Norm: 0.0063 +[2025-02-20 01:24:56] (step=0408800) Train Loss: 0.1508, Train Steps/Sec: 11.47, Grad Norm: 0.0063 +[2025-02-20 01:25:05] (step=0408900) Train Loss: 0.1498, Train Steps/Sec: 11.48, Grad Norm: 0.0063 +[2025-02-20 01:25:14] (step=0409000) Train Loss: 0.1515, Train Steps/Sec: 11.50, Grad Norm: 0.0062 +[2025-02-20 01:25:24] (step=0409100) Train Loss: 0.1516, Train Steps/Sec: 9.87, Grad Norm: 0.0066 +[2025-02-20 01:25:32] (step=0409200) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 01:25:41] (step=0409300) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 01:25:50] (step=0409400) Train Loss: 0.1522, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 01:25:59] (step=0409500) Train Loss: 0.1490, Train Steps/Sec: 10.58, Grad Norm: 0.0067 +[2025-02-20 01:26:08] (step=0409600) Train Loss: 0.1503, Train Steps/Sec: 11.29, Grad Norm: 0.0065 +[2025-02-20 01:26:17] (step=0409700) Train Loss: 0.1505, Train Steps/Sec: 10.90, Grad Norm: 0.0064 +[2025-02-20 01:26:26] (step=0409800) Train Loss: 0.1503, Train Steps/Sec: 11.31, Grad Norm: 0.0063 +[2025-02-20 01:26:34] (step=0409900) Train Loss: 0.1490, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:26:43] (step=0410000) Train Loss: 0.1504, Train Steps/Sec: 11.31, Grad Norm: 0.0064 +[2025-02-20 01:26:52] (step=0410100) Train Loss: 0.1515, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 01:27:01] (step=0410200) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 01:27:09] (step=0410300) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 01:27:19] (step=0410400) Train Loss: 0.1501, Train Steps/Sec: 10.10, Grad Norm: 0.0063 +[2025-02-20 01:27:28] (step=0410500) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 01:27:36] (step=0410600) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 01:27:45] (step=0410700) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 01:27:53] (step=0410800) Train Loss: 0.1507, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:28:02] (step=0410900) Train Loss: 0.1499, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:28:11] (step=0411000) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0066 +[2025-02-20 01:28:19] (step=0411100) Train Loss: 0.1516, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:28:28] (step=0411200) Train Loss: 0.1512, Train Steps/Sec: 11.66, Grad Norm: 0.0065 +[2025-02-20 01:28:36] (step=0411300) Train Loss: 0.1496, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:28:45] (step=0411400) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:28:53] (step=0411500) Train Loss: 0.1514, Train Steps/Sec: 11.65, Grad Norm: 0.0065 +[2025-02-20 01:29:03] (step=0411600) Train Loss: 0.1503, Train Steps/Sec: 10.04, Grad Norm: 0.0062 +[2025-02-20 01:29:13] (step=0411700) Train Loss: 0.1510, Train Steps/Sec: 10.60, Grad Norm: 0.0064 +[2025-02-20 01:29:22] (step=0411800) Train Loss: 0.1501, Train Steps/Sec: 10.93, Grad Norm: 0.0063 +[2025-02-20 01:29:31] (step=0411900) Train Loss: 0.1511, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 01:29:40] (step=0412000) Train Loss: 0.1506, Train Steps/Sec: 11.31, Grad Norm: 0.0063 +[2025-02-20 01:29:48] (step=0412100) Train Loss: 0.1517, Train Steps/Sec: 11.67, Grad Norm: 0.0067 +[2025-02-20 01:29:57] (step=0412200) Train Loss: 0.1494, Train Steps/Sec: 11.33, Grad Norm: 0.0064 +[2025-02-20 01:30:06] (step=0412300) Train Loss: 0.1494, Train Steps/Sec: 11.67, Grad Norm: 0.0066 +[2025-02-20 01:30:14] (step=0412400) Train Loss: 0.1505, Train Steps/Sec: 11.67, Grad Norm: 0.0063 +[2025-02-20 01:30:23] (step=0412500) Train Loss: 0.1498, Train Steps/Sec: 11.67, Grad Norm: 0.0066 +[2025-02-20 01:30:31] (step=0412600) Train Loss: 0.1504, Train Steps/Sec: 11.67, Grad Norm: 0.0062 +[2025-02-20 01:30:40] (step=0412700) Train Loss: 0.1504, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 01:30:49] (step=0412800) Train Loss: 0.1516, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 01:30:58] (step=0412900) Train Loss: 0.1514, Train Steps/Sec: 10.11, Grad Norm: 0.0066 +[2025-02-20 01:31:07] (step=0413000) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 01:31:16] (step=0413100) Train Loss: 0.1505, Train Steps/Sec: 11.66, Grad Norm: 0.0063 +[2025-02-20 01:31:24] (step=0413200) Train Loss: 0.1515, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 01:31:33] (step=0413300) Train Loss: 0.1514, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 01:31:41] (step=0413400) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 01:31:50] (step=0413500) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 01:31:59] (step=0413600) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-20 01:32:07] (step=0413700) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 01:32:16] (step=0413800) Train Loss: 0.1515, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 01:32:25] (step=0413900) Train Loss: 0.1518, Train Steps/Sec: 10.56, Grad Norm: 0.0063 +[2025-02-20 01:32:35] (step=0414000) Train Loss: 0.1508, Train Steps/Sec: 10.57, Grad Norm: 0.0063 +[2025-02-20 01:32:45] (step=0414100) Train Loss: 0.1495, Train Steps/Sec: 9.78, Grad Norm: 0.0062 +[2025-02-20 01:32:54] (step=0414200) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 01:33:02] (step=0414300) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 01:33:11] (step=0414400) Train Loss: 0.1500, Train Steps/Sec: 11.29, Grad Norm: 0.0062 +[2025-02-20 01:33:20] (step=0414500) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 01:33:28] (step=0414600) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 01:33:37] (step=0414700) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:33:46] (step=0414800) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:33:54] (step=0414900) Train Loss: 0.1523, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:34:03] (step=0415000) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:34:11] (step=0415100) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 01:34:20] (step=0415200) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 01:34:29] (step=0415300) Train Loss: 0.1510, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 01:34:38] (step=0415400) Train Loss: 0.1513, Train Steps/Sec: 10.10, Grad Norm: 0.0062 +[2025-02-20 01:34:47] (step=0415500) Train Loss: 0.1520, Train Steps/Sec: 11.68, Grad Norm: 0.0064 +[2025-02-20 01:34:56] (step=0415600) Train Loss: 0.1500, Train Steps/Sec: 11.68, Grad Norm: 0.0065 +[2025-02-20 01:35:04] (step=0415700) Train Loss: 0.1498, Train Steps/Sec: 11.68, Grad Norm: 0.0065 +[2025-02-20 01:35:13] (step=0415800) Train Loss: 0.1508, Train Steps/Sec: 11.68, Grad Norm: 0.0062 +[2025-02-20 01:35:21] (step=0415900) Train Loss: 0.1510, Train Steps/Sec: 11.68, Grad Norm: 0.0064 +[2025-02-20 01:35:30] (step=0416000) Train Loss: 0.1504, Train Steps/Sec: 11.68, Grad Norm: 0.0063 +[2025-02-20 01:35:40] (step=0416100) Train Loss: 0.1508, Train Steps/Sec: 10.35, Grad Norm: 0.0064 +[2025-02-20 01:35:49] (step=0416200) Train Loss: 0.1506, Train Steps/Sec: 10.95, Grad Norm: 0.0065 +[2025-02-20 01:35:58] (step=0416300) Train Loss: 0.1497, Train Steps/Sec: 11.29, Grad Norm: 0.0062 +[2025-02-20 01:36:06] (step=0416400) Train Loss: 0.1505, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 01:36:15] (step=0416500) Train Loss: 0.1512, Train Steps/Sec: 11.33, Grad Norm: 0.0063 +[2025-02-20 01:36:25] (step=0416600) Train Loss: 0.1502, Train Steps/Sec: 10.05, Grad Norm: 0.0063 +[2025-02-20 01:36:33] (step=0416700) Train Loss: 0.1504, Train Steps/Sec: 11.68, Grad Norm: 0.0063 +[2025-02-20 01:36:42] (step=0416800) Train Loss: 0.1501, Train Steps/Sec: 11.69, Grad Norm: 0.0063 +[2025-02-20 01:36:51] (step=0416900) Train Loss: 0.1516, Train Steps/Sec: 11.67, Grad Norm: 0.0064 +[2025-02-20 01:36:59] (step=0417000) Train Loss: 0.1517, Train Steps/Sec: 11.69, Grad Norm: 0.0064 +[2025-02-20 01:37:08] (step=0417100) Train Loss: 0.1504, Train Steps/Sec: 11.69, Grad Norm: 0.0067 +[2025-02-20 01:37:16] (step=0417200) Train Loss: 0.1514, Train Steps/Sec: 11.66, Grad Norm: 0.0066 +[2025-02-20 01:37:25] (step=0417300) Train Loss: 0.1499, Train Steps/Sec: 11.67, Grad Norm: 0.0062 +[2025-02-20 01:37:33] (step=0417400) Train Loss: 0.1487, Train Steps/Sec: 11.68, Grad Norm: 0.0063 +[2025-02-20 01:37:42] (step=0417500) Train Loss: 0.1506, Train Steps/Sec: 11.69, Grad Norm: 0.0061 +[2025-02-20 01:37:51] (step=0417600) Train Loss: 0.1498, Train Steps/Sec: 11.68, Grad Norm: 0.0062 +[2025-02-20 01:37:59] (step=0417700) Train Loss: 0.1494, Train Steps/Sec: 11.68, Grad Norm: 0.0063 +[2025-02-20 01:38:08] (step=0417800) Train Loss: 0.1504, Train Steps/Sec: 11.67, Grad Norm: 0.0064 +[2025-02-20 01:38:18] (step=0417900) Train Loss: 0.1499, Train Steps/Sec: 10.02, Grad Norm: 0.0063 +[2025-02-20 01:38:26] (step=0418000) Train Loss: 0.1511, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 01:38:35] (step=0418100) Train Loss: 0.1487, Train Steps/Sec: 11.66, Grad Norm: 0.0064 +[2025-02-20 01:38:44] (step=0418200) Train Loss: 0.1503, Train Steps/Sec: 10.60, Grad Norm: 0.0066 +[2025-02-20 01:38:53] (step=0418300) Train Loss: 0.1512, Train Steps/Sec: 10.95, Grad Norm: 0.0063 +[2025-02-20 01:39:02] (step=0418400) Train Loss: 0.1506, Train Steps/Sec: 11.28, Grad Norm: 0.0063 +[2025-02-20 01:39:11] (step=0418500) Train Loss: 0.1509, Train Steps/Sec: 11.32, Grad Norm: 0.0063 +[2025-02-20 01:39:20] (step=0418600) Train Loss: 0.1498, Train Steps/Sec: 11.66, Grad Norm: 0.0063 +[2025-02-20 01:39:28] (step=0418700) Train Loss: 0.1509, Train Steps/Sec: 11.31, Grad Norm: 0.0064 +[2025-02-20 01:39:37] (step=0418800) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 01:39:46] (step=0418900) Train Loss: 0.1507, Train Steps/Sec: 11.66, Grad Norm: 0.0063 +[2025-02-20 01:39:54] (step=0419000) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0065 +[2025-02-20 01:40:04] (step=0419100) Train Loss: 0.1511, Train Steps/Sec: 10.06, Grad Norm: 0.0063 +[2025-02-20 01:40:13] (step=0419200) Train Loss: 0.1506, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 01:40:21] (step=0419300) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:40:30] (step=0419400) Train Loss: 0.1507, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 01:40:39] (step=0419500) Train Loss: 0.1523, Train Steps/Sec: 11.66, Grad Norm: 0.0064 +[2025-02-20 01:40:47] (step=0419600) Train Loss: 0.1497, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:40:56] (step=0419700) Train Loss: 0.1500, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 01:41:04] (step=0419800) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 01:41:13] (step=0419900) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 01:41:21] (step=0420000) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 01:41:23] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0420000.pt +[2025-02-20 01:41:31] (step=0420100) Train Loss: 0.1500, Train Steps/Sec: 10.20, Grad Norm: 0.0063 +[2025-02-20 01:41:40] (step=0420200) Train Loss: 0.1499, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 01:41:48] (step=0420300) Train Loss: 0.1491, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:41:59] (step=0420400) Train Loss: 0.1507, Train Steps/Sec: 9.28, Grad Norm: 0.0064 +[2025-02-20 01:42:09] (step=0420500) Train Loss: 0.1503, Train Steps/Sec: 10.60, Grad Norm: 0.0064 +[2025-02-20 01:42:18] (step=0420600) Train Loss: 0.1502, Train Steps/Sec: 11.30, Grad Norm: 0.0061 +[2025-02-20 01:42:26] (step=0420700) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 01:42:35] (step=0420800) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 01:42:44] (step=0420900) Train Loss: 0.1495, Train Steps/Sec: 11.30, Grad Norm: 0.0062 +[2025-02-20 01:42:52] (step=0421000) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:43:01] (step=0421100) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 01:43:09] (step=0421200) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:43:18] (step=0421300) Train Loss: 0.1505, Train Steps/Sec: 11.67, Grad Norm: 0.0064 +[2025-02-20 01:43:26] (step=0421400) Train Loss: 0.1500, Train Steps/Sec: 11.67, Grad Norm: 0.0062 +[2025-02-20 01:43:35] (step=0421500) Train Loss: 0.1515, Train Steps/Sec: 11.66, Grad Norm: 0.0064 +[2025-02-20 01:43:45] (step=0421600) Train Loss: 0.1516, Train Steps/Sec: 9.95, Grad Norm: 0.0063 +[2025-02-20 01:43:54] (step=0421700) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-20 01:44:02] (step=0421800) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 01:44:11] (step=0421900) Train Loss: 0.1511, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 01:44:19] (step=0422000) Train Loss: 0.1492, Train Steps/Sec: 11.66, Grad Norm: 0.0063 +[2025-02-20 01:44:28] (step=0422100) Train Loss: 0.1511, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 01:44:37] (step=0422200) Train Loss: 0.1511, Train Steps/Sec: 11.67, Grad Norm: 0.0062 +[2025-02-20 01:44:45] (step=0422300) Train Loss: 0.1506, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-20 01:44:54] (step=0422400) Train Loss: 0.1504, Train Steps/Sec: 11.67, Grad Norm: 0.0063 +[2025-02-20 01:45:02] (step=0422500) Train Loss: 0.1508, Train Steps/Sec: 11.66, Grad Norm: 0.0064 +[2025-02-20 01:45:12] (step=0422600) Train Loss: 0.1519, Train Steps/Sec: 10.33, Grad Norm: 0.0064 +[2025-02-20 01:45:21] (step=0422700) Train Loss: 0.1500, Train Steps/Sec: 10.94, Grad Norm: 0.0065 +[2025-02-20 01:45:30] (step=0422800) Train Loss: 0.1511, Train Steps/Sec: 11.32, Grad Norm: 0.0064 +[2025-02-20 01:45:40] (step=0422900) Train Loss: 0.1505, Train Steps/Sec: 10.07, Grad Norm: 0.0062 +[2025-02-20 01:45:49] (step=0423000) Train Loss: 0.1502, Train Steps/Sec: 11.30, Grad Norm: 0.0063 +[2025-02-20 01:45:57] (step=0423100) Train Loss: 0.1503, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-20 01:46:06] (step=0423200) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:46:15] (step=0423300) Train Loss: 0.1513, Train Steps/Sec: 11.66, Grad Norm: 0.0065 +[2025-02-20 01:46:23] (step=0423400) Train Loss: 0.1495, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 01:46:32] (step=0423500) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:46:40] (step=0423600) Train Loss: 0.1491, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:46:49] (step=0423700) Train Loss: 0.1493, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:46:57] (step=0423800) Train Loss: 0.1516, Train Steps/Sec: 11.66, Grad Norm: 0.0064 +[2025-02-20 01:47:06] (step=0423900) Train Loss: 0.1511, Train Steps/Sec: 11.66, Grad Norm: 0.0064 +[2025-02-20 01:47:15] (step=0424000) Train Loss: 0.1506, Train Steps/Sec: 11.66, Grad Norm: 0.0065 +[2025-02-20 01:47:25] (step=0424100) Train Loss: 0.1511, Train Steps/Sec: 10.10, Grad Norm: 0.0062 +[2025-02-20 01:47:33] (step=0424200) Train Loss: 0.1505, Train Steps/Sec: 11.66, Grad Norm: 0.0064 +[2025-02-20 01:47:42] (step=0424300) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 01:47:50] (step=0424400) Train Loss: 0.1510, Train Steps/Sec: 11.66, Grad Norm: 0.0063 +[2025-02-20 01:47:59] (step=0424500) Train Loss: 0.1504, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 01:48:07] (step=0424600) Train Loss: 0.1508, Train Steps/Sec: 11.67, Grad Norm: 0.0063 +[2025-02-20 01:48:16] (step=0424700) Train Loss: 0.1501, Train Steps/Sec: 11.28, Grad Norm: 0.0063 +[2025-02-20 01:48:26] (step=0424800) Train Loss: 0.1499, Train Steps/Sec: 10.64, Grad Norm: 0.0062 +[2025-02-20 01:48:35] (step=0424900) Train Loss: 0.1504, Train Steps/Sec: 10.92, Grad Norm: 0.0062 +[2025-02-20 01:48:44] (step=0425000) Train Loss: 0.1498, Train Steps/Sec: 11.31, Grad Norm: 0.0063 +[2025-02-20 01:48:52] (step=0425100) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0069 +[2025-02-20 01:49:01] (step=0425200) Train Loss: 0.1507, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-20 01:49:10] (step=0425300) Train Loss: 0.1507, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 01:49:20] (step=0425400) Train Loss: 0.1506, Train Steps/Sec: 10.05, Grad Norm: 0.0064 +[2025-02-20 01:49:28] (step=0425500) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 01:49:37] (step=0425600) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 01:49:45] (step=0425700) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 01:49:54] (step=0425800) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 01:50:03] (step=0425900) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 01:50:11] (step=0426000) Train Loss: 0.1507, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:50:20] (step=0426100) Train Loss: 0.1493, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 01:50:28] (step=0426200) Train Loss: 0.1509, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-20 01:50:37] (step=0426300) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-20 01:50:46] (step=0426400) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 01:50:54] (step=0426500) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 01:51:04] (step=0426600) Train Loss: 0.1508, Train Steps/Sec: 10.09, Grad Norm: 0.0061 +[2025-02-20 01:51:13] (step=0426700) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 01:51:21] (step=0426800) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 01:51:31] (step=0426900) Train Loss: 0.1501, Train Steps/Sec: 10.59, Grad Norm: 0.0061 +[2025-02-20 01:51:40] (step=0427000) Train Loss: 0.1495, Train Steps/Sec: 10.93, Grad Norm: 0.0065 +[2025-02-20 01:51:49] (step=0427100) Train Loss: 0.1524, Train Steps/Sec: 10.94, Grad Norm: 0.0062 +[2025-02-20 01:51:58] (step=0427200) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 01:52:06] (step=0427300) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-20 01:52:15] (step=0427400) Train Loss: 0.1505, Train Steps/Sec: 11.30, Grad Norm: 0.0062 +[2025-02-20 01:52:24] (step=0427500) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 01:52:32] (step=0427600) Train Loss: 0.1500, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 01:52:41] (step=0427700) Train Loss: 0.1491, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 01:52:49] (step=0427800) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0066 +[2025-02-20 01:53:00] (step=0427900) Train Loss: 0.1501, Train Steps/Sec: 9.98, Grad Norm: 0.0062 +[2025-02-20 01:53:08] (step=0428000) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 01:53:17] (step=0428100) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:53:25] (step=0428200) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 01:53:34] (step=0428300) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:53:42] (step=0428400) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:53:51] (step=0428500) Train Loss: 0.1518, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 01:54:00] (step=0428600) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 01:54:08] (step=0428700) Train Loss: 0.1488, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 01:54:17] (step=0428800) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 01:54:25] (step=0428900) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 01:54:34] (step=0429000) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 01:54:45] (step=0429100) Train Loss: 0.1510, Train Steps/Sec: 9.22, Grad Norm: 0.0066 +[2025-02-20 01:54:54] (step=0429200) Train Loss: 0.1509, Train Steps/Sec: 10.56, Grad Norm: 0.0064 +[2025-02-20 01:55:03] (step=0429300) Train Loss: 0.1504, Train Steps/Sec: 11.28, Grad Norm: 0.0063 +[2025-02-20 01:55:12] (step=0429400) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 01:55:20] (step=0429500) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 01:55:29] (step=0429600) Train Loss: 0.1493, Train Steps/Sec: 11.29, Grad Norm: 0.0062 +[2025-02-20 01:55:38] (step=0429700) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 01:55:46] (step=0429800) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 01:55:55] (step=0429900) Train Loss: 0.1490, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 01:56:04] (step=0430000) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 01:56:12] (step=0430100) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 01:56:21] (step=0430200) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 01:56:29] (step=0430300) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 01:56:39] (step=0430400) Train Loss: 0.1502, Train Steps/Sec: 10.03, Grad Norm: 0.0061 +[2025-02-20 01:56:48] (step=0430500) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 01:56:57] (step=0430600) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 01:57:05] (step=0430700) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 01:57:14] (step=0430800) Train Loss: 0.1488, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 01:57:23] (step=0430900) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 01:57:31] (step=0431000) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 01:57:40] (step=0431100) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 01:57:48] (step=0431200) Train Loss: 0.1490, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 01:57:58] (step=0431300) Train Loss: 0.1505, Train Steps/Sec: 10.32, Grad Norm: 0.0064 +[2025-02-20 01:58:07] (step=0431400) Train Loss: 0.1512, Train Steps/Sec: 10.89, Grad Norm: 0.0064 +[2025-02-20 01:58:16] (step=0431500) Train Loss: 0.1491, Train Steps/Sec: 11.29, Grad Norm: 0.0060 +[2025-02-20 01:58:26] (step=0431600) Train Loss: 0.1495, Train Steps/Sec: 10.04, Grad Norm: 0.0063 +[2025-02-20 01:58:35] (step=0431700) Train Loss: 0.1504, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 01:58:44] (step=0431800) Train Loss: 0.1501, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:58:52] (step=0431900) Train Loss: 0.1499, Train Steps/Sec: 11.65, Grad Norm: 0.0067 +[2025-02-20 01:59:01] (step=0432000) Train Loss: 0.1496, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 01:59:09] (step=0432100) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 01:59:18] (step=0432200) Train Loss: 0.1495, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 01:59:26] (step=0432300) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-20 01:59:35] (step=0432400) Train Loss: 0.1507, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 01:59:44] (step=0432500) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 01:59:52] (step=0432600) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 02:00:01] (step=0432700) Train Loss: 0.1525, Train Steps/Sec: 11.65, Grad Norm: 0.0065 +[2025-02-20 02:00:09] (step=0432800) Train Loss: 0.1511, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 02:00:19] (step=0432900) Train Loss: 0.1497, Train Steps/Sec: 10.01, Grad Norm: 0.0064 +[2025-02-20 02:00:28] (step=0433000) Train Loss: 0.1491, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 02:00:37] (step=0433100) Train Loss: 0.1504, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 02:00:45] (step=0433200) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 02:00:54] (step=0433300) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:01:03] (step=0433400) Train Loss: 0.1505, Train Steps/Sec: 11.21, Grad Norm: 0.0063 +[2025-02-20 02:01:12] (step=0433500) Train Loss: 0.1498, Train Steps/Sec: 10.61, Grad Norm: 0.0061 +[2025-02-20 02:01:22] (step=0433600) Train Loss: 0.1495, Train Steps/Sec: 10.58, Grad Norm: 0.0060 +[2025-02-20 02:01:30] (step=0433700) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:01:39] (step=0433800) Train Loss: 0.1484, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:01:48] (step=0433900) Train Loss: 0.1494, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 02:01:56] (step=0434000) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 02:02:06] (step=0434100) Train Loss: 0.1502, Train Steps/Sec: 10.02, Grad Norm: 0.0065 +[2025-02-20 02:02:15] (step=0434200) Train Loss: 0.1519, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 02:02:24] (step=0434300) Train Loss: 0.1494, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-20 02:02:32] (step=0434400) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:02:41] (step=0434500) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:02:50] (step=0434600) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:02:58] (step=0434700) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 02:03:07] (step=0434800) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 02:03:15] (step=0434900) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 02:03:24] (step=0435000) Train Loss: 0.1521, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 02:03:33] (step=0435100) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 02:03:41] (step=0435200) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:03:50] (step=0435300) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 02:04:00] (step=0435400) Train Loss: 0.1503, Train Steps/Sec: 10.00, Grad Norm: 0.0062 +[2025-02-20 02:04:08] (step=0435500) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 02:04:18] (step=0435600) Train Loss: 0.1502, Train Steps/Sec: 10.59, Grad Norm: 0.0062 +[2025-02-20 02:04:27] (step=0435700) Train Loss: 0.1517, Train Steps/Sec: 10.94, Grad Norm: 0.0061 +[2025-02-20 02:04:36] (step=0435800) Train Loss: 0.1498, Train Steps/Sec: 10.94, Grad Norm: 0.0064 +[2025-02-20 02:04:45] (step=0435900) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 02:04:53] (step=0436000) Train Loss: 0.1516, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 02:05:02] (step=0436100) Train Loss: 0.1492, Train Steps/Sec: 11.27, Grad Norm: 0.0062 +[2025-02-20 02:05:11] (step=0436200) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 02:05:19] (step=0436300) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-20 02:05:28] (step=0436400) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 02:05:37] (step=0436500) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 02:05:47] (step=0436600) Train Loss: 0.1505, Train Steps/Sec: 10.05, Grad Norm: 0.0063 +[2025-02-20 02:05:55] (step=0436700) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 02:06:04] (step=0436800) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0068 +[2025-02-20 02:06:12] (step=0436900) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 02:06:21] (step=0437000) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:06:30] (step=0437100) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 02:06:38] (step=0437200) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:06:47] (step=0437300) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 02:06:55] (step=0437400) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 02:07:04] (step=0437500) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:07:13] (step=0437600) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:07:21] (step=0437700) Train Loss: 0.1515, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 02:07:31] (step=0437800) Train Loss: 0.1508, Train Steps/Sec: 10.54, Grad Norm: 0.0066 +[2025-02-20 02:07:42] (step=0437900) Train Loss: 0.1505, Train Steps/Sec: 9.27, Grad Norm: 0.0063 +[2025-02-20 02:07:50] (step=0438000) Train Loss: 0.1503, Train Steps/Sec: 11.27, Grad Norm: 0.0062 +[2025-02-20 02:07:59] (step=0438100) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:08:08] (step=0438200) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:08:17] (step=0438300) Train Loss: 0.1503, Train Steps/Sec: 11.27, Grad Norm: 0.0062 +[2025-02-20 02:08:25] (step=0438400) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 02:08:34] (step=0438500) Train Loss: 0.1516, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 02:08:42] (step=0438600) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:08:51] (step=0438700) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 02:09:00] (step=0438800) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:09:08] (step=0438900) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:09:17] (step=0439000) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 02:09:25] (step=0439100) Train Loss: 0.1512, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 02:09:35] (step=0439200) Train Loss: 0.1518, Train Steps/Sec: 10.05, Grad Norm: 0.0063 +[2025-02-20 02:09:44] (step=0439300) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:09:53] (step=0439400) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:10:01] (step=0439500) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:10:10] (step=0439600) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 02:10:18] (step=0439700) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 02:10:27] (step=0439800) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 02:10:36] (step=0439900) Train Loss: 0.1503, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-20 02:10:45] (step=0440000) Train Loss: 0.1506, Train Steps/Sec: 10.61, Grad Norm: 0.0063 +[2025-02-20 02:10:46] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0440000.pt +[2025-02-20 02:10:56] (step=0440100) Train Loss: 0.1507, Train Steps/Sec: 9.55, Grad Norm: 0.0063 +[2025-02-20 02:11:04] (step=0440200) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 02:11:13] (step=0440300) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 02:11:23] (step=0440400) Train Loss: 0.1504, Train Steps/Sec: 9.81, Grad Norm: 0.0061 +[2025-02-20 02:11:32] (step=0440500) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 02:11:40] (step=0440600) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 02:11:49] (step=0440700) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 02:11:58] (step=0440800) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 02:12:06] (step=0440900) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 02:12:15] (step=0441000) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 02:12:23] (step=0441100) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 02:12:32] (step=0441200) Train Loss: 0.1524, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 02:12:41] (step=0441300) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 02:12:49] (step=0441400) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 02:12:58] (step=0441500) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 02:13:06] (step=0441600) Train Loss: 0.1487, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 02:13:16] (step=0441700) Train Loss: 0.1494, Train Steps/Sec: 10.07, Grad Norm: 0.0064 +[2025-02-20 02:13:25] (step=0441800) Train Loss: 0.1500, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 02:13:34] (step=0441900) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 02:13:42] (step=0442000) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 02:13:51] (step=0442100) Train Loss: 0.1499, Train Steps/Sec: 10.89, Grad Norm: 0.0064 +[2025-02-20 02:14:00] (step=0442200) Train Loss: 0.1498, Train Steps/Sec: 10.95, Grad Norm: 0.0065 +[2025-02-20 02:14:10] (step=0442300) Train Loss: 0.1493, Train Steps/Sec: 10.89, Grad Norm: 0.0061 +[2025-02-20 02:14:18] (step=0442400) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 02:14:27] (step=0442500) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 02:14:36] (step=0442600) Train Loss: 0.1501, Train Steps/Sec: 11.27, Grad Norm: 0.0064 +[2025-02-20 02:14:44] (step=0442700) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 02:14:53] (step=0442800) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 02:15:03] (step=0442900) Train Loss: 0.1514, Train Steps/Sec: 10.02, Grad Norm: 0.0064 +[2025-02-20 02:15:12] (step=0443000) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-20 02:15:20] (step=0443100) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0067 +[2025-02-20 02:15:29] (step=0443200) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 02:15:37] (step=0443300) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 02:15:46] (step=0443400) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 02:15:55] (step=0443500) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 02:16:03] (step=0443600) Train Loss: 0.1524, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 02:16:12] (step=0443700) Train Loss: 0.1515, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 02:16:20] (step=0443800) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 02:16:29] (step=0443900) Train Loss: 0.1507, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 02:16:37] (step=0444000) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 02:16:46] (step=0444100) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 02:16:56] (step=0444200) Train Loss: 0.1497, Train Steps/Sec: 10.15, Grad Norm: 0.0061 +[2025-02-20 02:17:05] (step=0444300) Train Loss: 0.1509, Train Steps/Sec: 10.56, Grad Norm: 0.0062 +[2025-02-20 02:17:15] (step=0444400) Train Loss: 0.1505, Train Steps/Sec: 10.91, Grad Norm: 0.0066 +[2025-02-20 02:17:23] (step=0444500) Train Loss: 0.1499, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-20 02:17:32] (step=0444600) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 02:17:41] (step=0444700) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 02:17:50] (step=0444800) Train Loss: 0.1516, Train Steps/Sec: 11.29, Grad Norm: 0.0065 +[2025-02-20 02:17:58] (step=0444900) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:18:07] (step=0445000) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 02:18:15] (step=0445100) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:18:24] (step=0445200) Train Loss: 0.1524, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:18:33] (step=0445300) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:18:43] (step=0445400) Train Loss: 0.1510, Train Steps/Sec: 9.99, Grad Norm: 0.0062 +[2025-02-20 02:18:51] (step=0445500) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 02:19:00] (step=0445600) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 02:19:08] (step=0445700) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:19:17] (step=0445800) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 02:19:26] (step=0445900) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:19:34] (step=0446000) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 02:19:43] (step=0446100) Train Loss: 0.1514, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:19:51] (step=0446200) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:20:00] (step=0446300) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:20:09] (step=0446400) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 02:20:18] (step=0446500) Train Loss: 0.1512, Train Steps/Sec: 10.28, Grad Norm: 0.0062 +[2025-02-20 02:20:28] (step=0446600) Train Loss: 0.1500, Train Steps/Sec: 10.55, Grad Norm: 0.0062 +[2025-02-20 02:20:38] (step=0446700) Train Loss: 0.1506, Train Steps/Sec: 10.07, Grad Norm: 0.0066 +[2025-02-20 02:20:46] (step=0446800) Train Loss: 0.1485, Train Steps/Sec: 11.63, Grad Norm: 0.0069 +[2025-02-20 02:20:55] (step=0446900) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 02:21:04] (step=0447000) Train Loss: 0.1511, Train Steps/Sec: 11.28, Grad Norm: 0.0063 +[2025-02-20 02:21:13] (step=0447100) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 02:21:21] (step=0447200) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:21:30] (step=0447300) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:21:38] (step=0447400) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0066 +[2025-02-20 02:21:47] (step=0447500) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 02:21:56] (step=0447600) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 02:22:04] (step=0447700) Train Loss: 0.1485, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 02:22:13] (step=0447800) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 02:22:23] (step=0447900) Train Loss: 0.1499, Train Steps/Sec: 10.03, Grad Norm: 0.0059 +[2025-02-20 02:22:31] (step=0448000) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 02:22:40] (step=0448100) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 02:22:49] (step=0448200) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 02:22:57] (step=0448300) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 02:23:06] (step=0448400) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 02:23:14] (step=0448500) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 02:23:23] (step=0448600) Train Loss: 0.1495, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-20 02:23:33] (step=0448700) Train Loss: 0.1494, Train Steps/Sec: 10.63, Grad Norm: 0.0062 +[2025-02-20 02:23:42] (step=0448800) Train Loss: 0.1510, Train Steps/Sec: 10.60, Grad Norm: 0.0063 +[2025-02-20 02:23:51] (step=0448900) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 02:23:59] (step=0449000) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 02:24:08] (step=0449100) Train Loss: 0.1510, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 02:24:18] (step=0449200) Train Loss: 0.1511, Train Steps/Sec: 9.83, Grad Norm: 0.0064 +[2025-02-20 02:24:27] (step=0449300) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 02:24:35] (step=0449400) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 02:24:44] (step=0449500) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 02:24:53] (step=0449600) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 02:25:01] (step=0449700) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 02:25:10] (step=0449800) Train Loss: 0.1519, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:25:18] (step=0449900) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 02:25:27] (step=0450000) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 02:25:36] (step=0450100) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:25:44] (step=0450200) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:25:53] (step=0450300) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 02:26:03] (step=0450400) Train Loss: 0.1499, Train Steps/Sec: 9.97, Grad Norm: 0.0063 +[2025-02-20 02:26:12] (step=0450500) Train Loss: 0.1491, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 02:26:20] (step=0450600) Train Loss: 0.1497, Train Steps/Sec: 11.57, Grad Norm: 0.0064 +[2025-02-20 02:26:29] (step=0450700) Train Loss: 0.1508, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 02:26:38] (step=0450800) Train Loss: 0.1504, Train Steps/Sec: 10.54, Grad Norm: 0.0062 +[2025-02-20 02:26:48] (step=0450900) Train Loss: 0.1494, Train Steps/Sec: 10.59, Grad Norm: 0.0064 +[2025-02-20 02:26:57] (step=0451000) Train Loss: 0.1502, Train Steps/Sec: 11.21, Grad Norm: 0.0061 +[2025-02-20 02:27:05] (step=0451100) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 02:27:14] (step=0451200) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 02:27:23] (step=0451300) Train Loss: 0.1502, Train Steps/Sec: 11.26, Grad Norm: 0.0063 +[2025-02-20 02:27:31] (step=0451400) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:27:40] (step=0451500) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:27:49] (step=0451600) Train Loss: 0.1502, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 02:27:59] (step=0451700) Train Loss: 0.1508, Train Steps/Sec: 10.09, Grad Norm: 0.0062 +[2025-02-20 02:28:07] (step=0451800) Train Loss: 0.1488, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 02:28:16] (step=0451900) Train Loss: 0.1501, Train Steps/Sec: 11.66, Grad Norm: 0.0067 +[2025-02-20 02:28:24] (step=0452000) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:28:33] (step=0452100) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:28:42] (step=0452200) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:28:50] (step=0452300) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:28:59] (step=0452400) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 02:29:07] (step=0452500) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 02:29:16] (step=0452600) Train Loss: 0.1517, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 02:29:25] (step=0452700) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 02:29:33] (step=0452800) Train Loss: 0.1509, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 02:29:43] (step=0452900) Train Loss: 0.1510, Train Steps/Sec: 10.05, Grad Norm: 0.0062 +[2025-02-20 02:29:53] (step=0453000) Train Loss: 0.1508, Train Steps/Sec: 10.57, Grad Norm: 0.0062 +[2025-02-20 02:30:02] (step=0453100) Train Loss: 0.1499, Train Steps/Sec: 10.28, Grad Norm: 0.0064 +[2025-02-20 02:30:11] (step=0453200) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 02:30:19] (step=0453300) Train Loss: 0.1487, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 02:30:28] (step=0453400) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 02:30:37] (step=0453500) Train Loss: 0.1508, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-20 02:30:46] (step=0453600) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 02:30:54] (step=0453700) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 02:31:03] (step=0453800) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:31:11] (step=0453900) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:31:20] (step=0454000) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 02:31:29] (step=0454100) Train Loss: 0.1513, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 02:31:39] (step=0454200) Train Loss: 0.1515, Train Steps/Sec: 10.03, Grad Norm: 0.0066 +[2025-02-20 02:31:47] (step=0454300) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 02:31:56] (step=0454400) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:32:04] (step=0454500) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:32:13] (step=0454600) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:32:22] (step=0454700) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:32:30] (step=0454800) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:32:39] (step=0454900) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 02:32:48] (step=0455000) Train Loss: 0.1518, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:32:56] (step=0455100) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 02:33:06] (step=0455200) Train Loss: 0.1498, Train Steps/Sec: 10.26, Grad Norm: 0.0062 +[2025-02-20 02:33:15] (step=0455300) Train Loss: 0.1494, Train Steps/Sec: 10.58, Grad Norm: 0.0063 +[2025-02-20 02:33:25] (step=0455400) Train Loss: 0.1513, Train Steps/Sec: 9.99, Grad Norm: 0.0063 +[2025-02-20 02:33:34] (step=0455500) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 02:33:43] (step=0455600) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 02:33:51] (step=0455700) Train Loss: 0.1515, Train Steps/Sec: 11.28, Grad Norm: 0.0063 +[2025-02-20 02:34:00] (step=0455800) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:34:09] (step=0455900) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:34:17] (step=0456000) Train Loss: 0.1518, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:34:26] (step=0456100) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:34:35] (step=0456200) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:34:43] (step=0456300) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 02:34:52] (step=0456400) Train Loss: 0.1514, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 02:35:00] (step=0456500) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:35:09] (step=0456600) Train Loss: 0.1500, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 02:35:19] (step=0456700) Train Loss: 0.1500, Train Steps/Sec: 10.08, Grad Norm: 0.0061 +[2025-02-20 02:35:28] (step=0456800) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:35:36] (step=0456900) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:35:45] (step=0457000) Train Loss: 0.1511, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 02:35:53] (step=0457100) Train Loss: 0.1495, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 02:36:02] (step=0457200) Train Loss: 0.1507, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 02:36:11] (step=0457300) Train Loss: 0.1506, Train Steps/Sec: 11.19, Grad Norm: 0.0061 +[2025-02-20 02:36:21] (step=0457400) Train Loss: 0.1510, Train Steps/Sec: 10.30, Grad Norm: 0.0062 +[2025-02-20 02:36:30] (step=0457500) Train Loss: 0.1505, Train Steps/Sec: 10.88, Grad Norm: 0.0061 +[2025-02-20 02:36:39] (step=0457600) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 02:36:47] (step=0457700) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:36:56] (step=0457800) Train Loss: 0.1523, Train Steps/Sec: 11.26, Grad Norm: 0.0064 +[2025-02-20 02:37:06] (step=0457900) Train Loss: 0.1513, Train Steps/Sec: 9.99, Grad Norm: 0.0064 +[2025-02-20 02:37:15] (step=0458000) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 02:37:23] (step=0458100) Train Loss: 0.1515, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 02:37:32] (step=0458200) Train Loss: 0.1516, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:37:41] (step=0458300) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:37:49] (step=0458400) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 02:37:58] (step=0458500) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:38:06] (step=0458600) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:38:15] (step=0458700) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 02:38:24] (step=0458800) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 02:38:32] (step=0458900) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:38:41] (step=0459000) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:38:49] (step=0459100) Train Loss: 0.1501, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 02:38:59] (step=0459200) Train Loss: 0.1513, Train Steps/Sec: 10.07, Grad Norm: 0.0065 +[2025-02-20 02:39:08] (step=0459300) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 02:39:17] (step=0459400) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 02:39:26] (step=0459500) Train Loss: 0.1494, Train Steps/Sec: 10.59, Grad Norm: 0.0061 +[2025-02-20 02:39:35] (step=0459600) Train Loss: 0.1519, Train Steps/Sec: 10.63, Grad Norm: 0.0061 +[2025-02-20 02:39:44] (step=0459700) Train Loss: 0.1499, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-20 02:39:53] (step=0459800) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 02:40:01] (step=0459900) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 02:40:10] (step=0460000) Train Loss: 0.1507, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-20 02:40:11] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0460000.pt +[2025-02-20 02:40:20] (step=0460100) Train Loss: 0.1497, Train Steps/Sec: 10.65, Grad Norm: 0.0064 +[2025-02-20 02:40:28] (step=0460200) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 02:40:37] (step=0460300) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-20 02:40:47] (step=0460400) Train Loss: 0.1492, Train Steps/Sec: 10.05, Grad Norm: 0.0064 +[2025-02-20 02:40:55] (step=0460500) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 02:41:04] (step=0460600) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 02:41:13] (step=0460700) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 02:41:21] (step=0460800) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 02:41:30] (step=0460900) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:41:38] (step=0461000) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:41:47] (step=0461100) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 02:41:56] (step=0461200) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 02:42:04] (step=0461300) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:42:13] (step=0461400) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 02:42:21] (step=0461500) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 02:42:30] (step=0461600) Train Loss: 0.1497, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 02:42:41] (step=0461700) Train Loss: 0.1489, Train Steps/Sec: 8.96, Grad Norm: 0.0064 +[2025-02-20 02:42:51] (step=0461800) Train Loss: 0.1504, Train Steps/Sec: 10.58, Grad Norm: 0.0062 +[2025-02-20 02:42:59] (step=0461900) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 02:43:08] (step=0462000) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 02:43:17] (step=0462100) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:43:25] (step=0462200) Train Loss: 0.1499, Train Steps/Sec: 11.28, Grad Norm: 0.0063 +[2025-02-20 02:43:34] (step=0462300) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 02:43:43] (step=0462400) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 02:43:51] (step=0462500) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 02:44:00] (step=0462600) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 02:44:08] (step=0462700) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 02:44:17] (step=0462800) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 02:44:27] (step=0462900) Train Loss: 0.1500, Train Steps/Sec: 9.95, Grad Norm: 0.0062 +[2025-02-20 02:44:36] (step=0463000) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:44:44] (step=0463100) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 02:44:53] (step=0463200) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:45:02] (step=0463300) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:45:10] (step=0463400) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 02:45:19] (step=0463500) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 02:45:27] (step=0463600) Train Loss: 0.1481, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 02:45:36] (step=0463700) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 02:45:45] (step=0463800) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:45:55] (step=0463900) Train Loss: 0.1508, Train Steps/Sec: 9.98, Grad Norm: 0.0063 +[2025-02-20 02:46:04] (step=0464000) Train Loss: 0.1513, Train Steps/Sec: 10.87, Grad Norm: 0.0062 +[2025-02-20 02:46:12] (step=0464100) Train Loss: 0.1506, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 02:46:23] (step=0464200) Train Loss: 0.1510, Train Steps/Sec: 9.94, Grad Norm: 0.0062 +[2025-02-20 02:46:31] (step=0464300) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 02:46:40] (step=0464400) Train Loss: 0.1502, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 02:46:49] (step=0464500) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 02:46:57] (step=0464600) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 02:47:06] (step=0464700) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 02:47:14] (step=0464800) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:47:23] (step=0464900) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:47:32] (step=0465000) Train Loss: 0.1515, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 02:47:40] (step=0465100) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:47:49] (step=0465200) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 02:47:57] (step=0465300) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:48:07] (step=0465400) Train Loss: 0.1506, Train Steps/Sec: 10.01, Grad Norm: 0.0062 +[2025-02-20 02:48:16] (step=0465500) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:48:25] (step=0465600) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0067 +[2025-02-20 02:48:33] (step=0465700) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:48:42] (step=0465800) Train Loss: 0.1516, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 02:48:51] (step=0465900) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 02:48:59] (step=0466000) Train Loss: 0.1509, Train Steps/Sec: 11.23, Grad Norm: 0.0063 +[2025-02-20 02:49:09] (step=0466100) Train Loss: 0.1489, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-20 02:49:18] (step=0466200) Train Loss: 0.1492, Train Steps/Sec: 11.21, Grad Norm: 0.0061 +[2025-02-20 02:49:27] (step=0466300) Train Loss: 0.1518, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 02:49:36] (step=0466400) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 02:49:44] (step=0466500) Train Loss: 0.1505, Train Steps/Sec: 11.26, Grad Norm: 0.0062 +[2025-02-20 02:49:53] (step=0466600) Train Loss: 0.1502, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 02:50:03] (step=0466700) Train Loss: 0.1493, Train Steps/Sec: 10.08, Grad Norm: 0.0061 +[2025-02-20 02:50:12] (step=0466800) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 02:50:20] (step=0466900) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 02:50:29] (step=0467000) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 02:50:38] (step=0467100) Train Loss: 0.1510, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 02:50:46] (step=0467200) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0066 +[2025-02-20 02:50:55] (step=0467300) Train Loss: 0.1509, Train Steps/Sec: 11.50, Grad Norm: 0.0062 +[2025-02-20 02:51:04] (step=0467400) Train Loss: 0.1504, Train Steps/Sec: 11.45, Grad Norm: 0.0064 +[2025-02-20 02:51:12] (step=0467500) Train Loss: 0.1515, Train Steps/Sec: 11.45, Grad Norm: 0.0062 +[2025-02-20 02:51:21] (step=0467600) Train Loss: 0.1486, Train Steps/Sec: 11.45, Grad Norm: 0.0061 +[2025-02-20 02:51:30] (step=0467700) Train Loss: 0.1499, Train Steps/Sec: 11.46, Grad Norm: 0.0062 +[2025-02-20 02:51:39] (step=0467800) Train Loss: 0.1518, Train Steps/Sec: 11.46, Grad Norm: 0.0065 +[2025-02-20 02:51:49] (step=0467900) Train Loss: 0.1510, Train Steps/Sec: 9.90, Grad Norm: 0.0062 +[2025-02-20 02:51:57] (step=0468000) Train Loss: 0.1510, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 02:52:06] (step=0468100) Train Loss: 0.1496, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 02:52:16] (step=0468200) Train Loss: 0.1520, Train Steps/Sec: 10.24, Grad Norm: 0.0062 +[2025-02-20 02:52:25] (step=0468300) Train Loss: 0.1510, Train Steps/Sec: 10.79, Grad Norm: 0.0062 +[2025-02-20 02:52:34] (step=0468400) Train Loss: 0.1491, Train Steps/Sec: 11.15, Grad Norm: 0.0062 +[2025-02-20 02:52:43] (step=0468500) Train Loss: 0.1501, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 02:52:51] (step=0468600) Train Loss: 0.1507, Train Steps/Sec: 11.56, Grad Norm: 0.0064 +[2025-02-20 02:53:00] (step=0468700) Train Loss: 0.1500, Train Steps/Sec: 11.22, Grad Norm: 0.0062 +[2025-02-20 02:53:09] (step=0468800) Train Loss: 0.1510, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 02:53:18] (step=0468900) Train Loss: 0.1505, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 02:53:26] (step=0469000) Train Loss: 0.1507, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 02:53:35] (step=0469100) Train Loss: 0.1516, Train Steps/Sec: 11.52, Grad Norm: 0.0062 +[2025-02-20 02:53:45] (step=0469200) Train Loss: 0.1510, Train Steps/Sec: 9.88, Grad Norm: 0.0064 +[2025-02-20 02:53:54] (step=0469300) Train Loss: 0.1520, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:54:02] (step=0469400) Train Loss: 0.1491, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 02:54:11] (step=0469500) Train Loss: 0.1497, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 02:54:20] (step=0469600) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 02:54:28] (step=0469700) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:54:37] (step=0469800) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:54:45] (step=0469900) Train Loss: 0.1494, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 02:54:54] (step=0470000) Train Loss: 0.1487, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 02:55:03] (step=0470100) Train Loss: 0.1520, Train Steps/Sec: 11.58, Grad Norm: 0.0065 +[2025-02-20 02:55:11] (step=0470200) Train Loss: 0.1508, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 02:55:20] (step=0470300) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 02:55:31] (step=0470400) Train Loss: 0.1504, Train Steps/Sec: 9.02, Grad Norm: 0.0062 +[2025-02-20 02:55:40] (step=0470500) Train Loss: 0.1499, Train Steps/Sec: 10.54, Grad Norm: 0.0062 +[2025-02-20 02:55:49] (step=0470600) Train Loss: 0.1484, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 02:55:58] (step=0470700) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:56:06] (step=0470800) Train Loss: 0.1504, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 02:56:15] (step=0470900) Train Loss: 0.1507, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 02:56:24] (step=0471000) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 02:56:33] (step=0471100) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 02:56:41] (step=0471200) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 02:56:50] (step=0471300) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:56:58] (step=0471400) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 02:57:07] (step=0471500) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 02:57:16] (step=0471600) Train Loss: 0.1484, Train Steps/Sec: 11.54, Grad Norm: 0.0063 +[2025-02-20 02:57:26] (step=0471700) Train Loss: 0.1500, Train Steps/Sec: 9.91, Grad Norm: 0.0061 +[2025-02-20 02:57:34] (step=0471800) Train Loss: 0.1514, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 02:57:43] (step=0471900) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 02:57:52] (step=0472000) Train Loss: 0.1516, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 02:58:00] (step=0472100) Train Loss: 0.1499, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 02:58:09] (step=0472200) Train Loss: 0.1514, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 02:58:18] (step=0472300) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 02:58:26] (step=0472400) Train Loss: 0.1486, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 02:58:35] (step=0472500) Train Loss: 0.1508, Train Steps/Sec: 11.22, Grad Norm: 0.0062 +[2025-02-20 02:58:45] (step=0472600) Train Loss: 0.1493, Train Steps/Sec: 10.25, Grad Norm: 0.0063 +[2025-02-20 02:58:54] (step=0472700) Train Loss: 0.1491, Train Steps/Sec: 10.89, Grad Norm: 0.0061 +[2025-02-20 02:59:03] (step=0472800) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 02:59:13] (step=0472900) Train Loss: 0.1489, Train Steps/Sec: 10.06, Grad Norm: 0.0063 +[2025-02-20 02:59:21] (step=0473000) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 02:59:30] (step=0473100) Train Loss: 0.1515, Train Steps/Sec: 11.27, Grad Norm: 0.0062 +[2025-02-20 02:59:39] (step=0473200) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 02:59:47] (step=0473300) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 02:59:56] (step=0473400) Train Loss: 0.1512, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:00:04] (step=0473500) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 03:00:13] (step=0473600) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 03:00:22] (step=0473700) Train Loss: 0.1493, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 03:00:30] (step=0473800) Train Loss: 0.1486, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 03:00:39] (step=0473900) Train Loss: 0.1493, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 03:00:47] (step=0474000) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 03:00:56] (step=0474100) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 03:01:06] (step=0474200) Train Loss: 0.1503, Train Steps/Sec: 10.09, Grad Norm: 0.0062 +[2025-02-20 03:01:15] (step=0474300) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 03:01:23] (step=0474400) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 03:01:32] (step=0474500) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 03:01:40] (step=0474600) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 03:01:50] (step=0474700) Train Loss: 0.1521, Train Steps/Sec: 10.30, Grad Norm: 0.0060 +[2025-02-20 03:01:59] (step=0474800) Train Loss: 0.1495, Train Steps/Sec: 10.92, Grad Norm: 0.0062 +[2025-02-20 03:02:08] (step=0474900) Train Loss: 0.1505, Train Steps/Sec: 11.24, Grad Norm: 0.0068 +[2025-02-20 03:02:17] (step=0475000) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 03:02:25] (step=0475100) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-20 03:02:34] (step=0475200) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 03:02:43] (step=0475300) Train Loss: 0.1506, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 03:02:53] (step=0475400) Train Loss: 0.1500, Train Steps/Sec: 9.95, Grad Norm: 0.0060 +[2025-02-20 03:03:01] (step=0475500) Train Loss: 0.1485, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 03:03:10] (step=0475600) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:03:19] (step=0475700) Train Loss: 0.1518, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 03:03:27] (step=0475800) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 03:03:36] (step=0475900) Train Loss: 0.1488, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 03:03:45] (step=0476000) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 03:03:53] (step=0476100) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 03:04:02] (step=0476200) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:04:10] (step=0476300) Train Loss: 0.1507, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 03:04:19] (step=0476400) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 03:04:28] (step=0476500) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 03:04:36] (step=0476600) Train Loss: 0.1493, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 03:04:46] (step=0476700) Train Loss: 0.1517, Train Steps/Sec: 10.08, Grad Norm: 0.0062 +[2025-02-20 03:04:55] (step=0476800) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 03:05:05] (step=0476900) Train Loss: 0.1487, Train Steps/Sec: 10.27, Grad Norm: 0.0060 +[2025-02-20 03:05:14] (step=0477000) Train Loss: 0.1505, Train Steps/Sec: 10.90, Grad Norm: 0.0063 +[2025-02-20 03:05:23] (step=0477100) Train Loss: 0.1507, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-20 03:05:31] (step=0477200) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 03:05:40] (step=0477300) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 03:05:49] (step=0477400) Train Loss: 0.1506, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 03:05:57] (step=0477500) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 03:06:06] (step=0477600) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:06:15] (step=0477700) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:06:23] (step=0477800) Train Loss: 0.1493, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 03:06:33] (step=0477900) Train Loss: 0.1506, Train Steps/Sec: 10.00, Grad Norm: 0.0063 +[2025-02-20 03:06:42] (step=0478000) Train Loss: 0.1492, Train Steps/Sec: 11.54, Grad Norm: 0.0059 +[2025-02-20 03:06:51] (step=0478100) Train Loss: 0.1509, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 03:06:59] (step=0478200) Train Loss: 0.1504, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 03:07:08] (step=0478300) Train Loss: 0.1492, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 03:07:17] (step=0478400) Train Loss: 0.1500, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 03:07:25] (step=0478500) Train Loss: 0.1514, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 03:07:34] (step=0478600) Train Loss: 0.1516, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 03:07:42] (step=0478700) Train Loss: 0.1498, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 03:07:51] (step=0478800) Train Loss: 0.1507, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 03:08:00] (step=0478900) Train Loss: 0.1501, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 03:08:08] (step=0479000) Train Loss: 0.1502, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 03:08:18] (step=0479100) Train Loss: 0.1512, Train Steps/Sec: 10.24, Grad Norm: 0.0063 +[2025-02-20 03:08:29] (step=0479200) Train Loss: 0.1501, Train Steps/Sec: 9.20, Grad Norm: 0.0064 +[2025-02-20 03:08:38] (step=0479300) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 03:08:46] (step=0479400) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 03:08:55] (step=0479500) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 03:09:04] (step=0479600) Train Loss: 0.1513, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 03:09:12] (step=0479700) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 03:09:21] (step=0479800) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 03:09:30] (step=0479900) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 03:09:38] (step=0480000) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 03:09:39] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0480000.pt +[2025-02-20 03:09:48] (step=0480100) Train Loss: 0.1505, Train Steps/Sec: 10.04, Grad Norm: 0.0062 +[2025-02-20 03:09:57] (step=0480200) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 03:10:05] (step=0480300) Train Loss: 0.1519, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 03:10:15] (step=0480400) Train Loss: 0.1506, Train Steps/Sec: 9.93, Grad Norm: 0.0062 +[2025-02-20 03:10:24] (step=0480500) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 03:10:33] (step=0480600) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:10:41] (step=0480700) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 03:10:50] (step=0480800) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 03:10:59] (step=0480900) Train Loss: 0.1494, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 03:11:07] (step=0481000) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 03:11:16] (step=0481100) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 03:11:25] (step=0481200) Train Loss: 0.1510, Train Steps/Sec: 10.90, Grad Norm: 0.0064 +[2025-02-20 03:11:34] (step=0481300) Train Loss: 0.1506, Train Steps/Sec: 10.57, Grad Norm: 0.0060 +[2025-02-20 03:11:44] (step=0481400) Train Loss: 0.1496, Train Steps/Sec: 10.89, Grad Norm: 0.0060 +[2025-02-20 03:11:52] (step=0481500) Train Loss: 0.1491, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 03:12:01] (step=0481600) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 03:12:11] (step=0481700) Train Loss: 0.1500, Train Steps/Sec: 9.96, Grad Norm: 0.0063 +[2025-02-20 03:12:20] (step=0481800) Train Loss: 0.1505, Train Steps/Sec: 11.25, Grad Norm: 0.0064 +[2025-02-20 03:12:28] (step=0481900) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 03:12:37] (step=0482000) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 03:12:46] (step=0482100) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:12:54] (step=0482200) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 03:13:03] (step=0482300) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 03:13:11] (step=0482400) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 03:13:20] (step=0482500) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 03:13:29] (step=0482600) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 03:13:37] (step=0482700) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 03:13:46] (step=0482800) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:13:56] (step=0482900) Train Loss: 0.1508, Train Steps/Sec: 10.02, Grad Norm: 0.0063 +[2025-02-20 03:14:04] (step=0483000) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 03:14:13] (step=0483100) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 03:14:22] (step=0483200) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 03:14:30] (step=0483300) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:14:40] (step=0483400) Train Loss: 0.1513, Train Steps/Sec: 10.29, Grad Norm: 0.0061 +[2025-02-20 03:14:49] (step=0483500) Train Loss: 0.1503, Train Steps/Sec: 10.90, Grad Norm: 0.0062 +[2025-02-20 03:14:58] (step=0483600) Train Loss: 0.1514, Train Steps/Sec: 11.24, Grad Norm: 0.0064 +[2025-02-20 03:15:07] (step=0483700) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 03:15:15] (step=0483800) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 03:15:24] (step=0483900) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:15:33] (step=0484000) Train Loss: 0.1493, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 03:15:41] (step=0484100) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 03:15:51] (step=0484200) Train Loss: 0.1501, Train Steps/Sec: 9.96, Grad Norm: 0.0062 +[2025-02-20 03:16:00] (step=0484300) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 03:16:09] (step=0484400) Train Loss: 0.1495, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 03:16:17] (step=0484500) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:16:26] (step=0484600) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 03:16:35] (step=0484700) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 03:16:43] (step=0484800) Train Loss: 0.1488, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 03:16:52] (step=0484900) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 03:17:00] (step=0485000) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 03:17:09] (step=0485100) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 03:17:18] (step=0485200) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 03:17:26] (step=0485300) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 03:17:36] (step=0485400) Train Loss: 0.1491, Train Steps/Sec: 10.02, Grad Norm: 0.0064 +[2025-02-20 03:17:45] (step=0485500) Train Loss: 0.1500, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 03:17:55] (step=0485600) Train Loss: 0.1512, Train Steps/Sec: 10.56, Grad Norm: 0.0064 +[2025-02-20 03:18:04] (step=0485700) Train Loss: 0.1503, Train Steps/Sec: 10.90, Grad Norm: 0.0062 +[2025-02-20 03:18:13] (step=0485800) Train Loss: 0.1503, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 03:18:21] (step=0485900) Train Loss: 0.1518, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 03:18:30] (step=0486000) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 03:18:39] (step=0486100) Train Loss: 0.1500, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-20 03:18:47] (step=0486200) Train Loss: 0.1517, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 03:18:56] (step=0486300) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 03:19:05] (step=0486400) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 03:19:13] (step=0486500) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 03:19:22] (step=0486600) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 03:19:32] (step=0486700) Train Loss: 0.1516, Train Steps/Sec: 10.01, Grad Norm: 0.0061 +[2025-02-20 03:19:40] (step=0486800) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 03:19:49] (step=0486900) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 03:19:58] (step=0487000) Train Loss: 0.1488, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 03:20:06] (step=0487100) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:20:15] (step=0487200) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 03:20:23] (step=0487300) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 03:20:32] (step=0487400) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 03:20:41] (step=0487500) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 03:20:49] (step=0487600) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 03:20:58] (step=0487700) Train Loss: 0.1493, Train Steps/Sec: 11.27, Grad Norm: 0.0062 +[2025-02-20 03:21:08] (step=0487800) Train Loss: 0.1486, Train Steps/Sec: 10.53, Grad Norm: 0.0060 +[2025-02-20 03:21:19] (step=0487900) Train Loss: 0.1502, Train Steps/Sec: 9.24, Grad Norm: 0.0063 +[2025-02-20 03:21:27] (step=0488000) Train Loss: 0.1501, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 03:21:36] (step=0488100) Train Loss: 0.1487, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 03:21:44] (step=0488200) Train Loss: 0.1503, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 03:21:53] (step=0488300) Train Loss: 0.1501, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-20 03:22:02] (step=0488400) Train Loss: 0.1513, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 03:22:11] (step=0488500) Train Loss: 0.1499, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 03:22:19] (step=0488600) Train Loss: 0.1487, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 03:22:28] (step=0488700) Train Loss: 0.1496, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 03:22:36] (step=0488800) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 03:22:45] (step=0488900) Train Loss: 0.1504, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 03:22:54] (step=0489000) Train Loss: 0.1512, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 03:23:02] (step=0489100) Train Loss: 0.1495, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 03:23:12] (step=0489200) Train Loss: 0.1512, Train Steps/Sec: 9.95, Grad Norm: 0.0062 +[2025-02-20 03:23:21] (step=0489300) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:23:30] (step=0489400) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 03:23:38] (step=0489500) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 03:23:47] (step=0489600) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 03:23:56] (step=0489700) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 03:24:04] (step=0489800) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:24:14] (step=0489900) Train Loss: 0.1505, Train Steps/Sec: 10.60, Grad Norm: 0.0063 +[2025-02-20 03:24:23] (step=0490000) Train Loss: 0.1493, Train Steps/Sec: 10.86, Grad Norm: 0.0062 +[2025-02-20 03:24:32] (step=0490100) Train Loss: 0.1491, Train Steps/Sec: 10.90, Grad Norm: 0.0061 +[2025-02-20 03:24:41] (step=0490200) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 03:24:49] (step=0490300) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 03:24:59] (step=0490400) Train Loss: 0.1509, Train Steps/Sec: 10.01, Grad Norm: 0.0061 +[2025-02-20 03:25:08] (step=0490500) Train Loss: 0.1489, Train Steps/Sec: 11.20, Grad Norm: 0.0061 +[2025-02-20 03:25:17] (step=0490600) Train Loss: 0.1506, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 03:25:25] (step=0490700) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 03:25:34] (step=0490800) Train Loss: 0.1479, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 03:25:43] (step=0490900) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:25:51] (step=0491000) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 03:26:00] (step=0491100) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:26:09] (step=0491200) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 03:26:17] (step=0491300) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 03:26:26] (step=0491400) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 03:26:34] (step=0491500) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 03:26:43] (step=0491600) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 03:26:53] (step=0491700) Train Loss: 0.1504, Train Steps/Sec: 9.96, Grad Norm: 0.0063 +[2025-02-20 03:27:02] (step=0491800) Train Loss: 0.1494, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 03:27:10] (step=0491900) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 03:27:19] (step=0492000) Train Loss: 0.1498, Train Steps/Sec: 11.25, Grad Norm: 0.0058 +[2025-02-20 03:27:29] (step=0492100) Train Loss: 0.1514, Train Steps/Sec: 10.59, Grad Norm: 0.0062 +[2025-02-20 03:27:38] (step=0492200) Train Loss: 0.1496, Train Steps/Sec: 10.92, Grad Norm: 0.0061 +[2025-02-20 03:27:47] (step=0492300) Train Loss: 0.1498, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 03:27:55] (step=0492400) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 03:28:04] (step=0492500) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 03:28:13] (step=0492600) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 03:28:21] (step=0492700) Train Loss: 0.1507, Train Steps/Sec: 11.26, Grad Norm: 0.0064 +[2025-02-20 03:28:30] (step=0492800) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 03:28:40] (step=0492900) Train Loss: 0.1511, Train Steps/Sec: 10.02, Grad Norm: 0.0062 +[2025-02-20 03:28:49] (step=0493000) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-20 03:28:57] (step=0493100) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 03:29:06] (step=0493200) Train Loss: 0.1490, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 03:29:14] (step=0493300) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 03:29:23] (step=0493400) Train Loss: 0.1489, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 03:29:32] (step=0493500) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 03:29:40] (step=0493600) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 03:29:49] (step=0493700) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 03:29:58] (step=0493800) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 03:30:06] (step=0493900) Train Loss: 0.1491, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 03:30:15] (step=0494000) Train Loss: 0.1491, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 03:30:23] (step=0494100) Train Loss: 0.1519, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 03:30:34] (step=0494200) Train Loss: 0.1491, Train Steps/Sec: 9.70, Grad Norm: 0.0062 +[2025-02-20 03:30:43] (step=0494300) Train Loss: 0.1507, Train Steps/Sec: 10.54, Grad Norm: 0.0066 +[2025-02-20 03:30:53] (step=0494400) Train Loss: 0.1492, Train Steps/Sec: 10.54, Grad Norm: 0.0062 +[2025-02-20 03:31:01] (step=0494500) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 03:31:10] (step=0494600) Train Loss: 0.1491, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-20 03:31:19] (step=0494700) Train Loss: 0.1507, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-20 03:31:28] (step=0494800) Train Loss: 0.1503, Train Steps/Sec: 11.25, Grad Norm: 0.0064 +[2025-02-20 03:31:36] (step=0494900) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 03:31:45] (step=0495000) Train Loss: 0.1505, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 03:31:53] (step=0495100) Train Loss: 0.1498, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 03:32:02] (step=0495200) Train Loss: 0.1493, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 03:32:11] (step=0495300) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 03:32:21] (step=0495400) Train Loss: 0.1512, Train Steps/Sec: 9.97, Grad Norm: 0.0064 +[2025-02-20 03:32:29] (step=0495500) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 03:32:38] (step=0495600) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 03:32:47] (step=0495700) Train Loss: 0.1510, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 03:32:55] (step=0495800) Train Loss: 0.1505, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 03:33:04] (step=0495900) Train Loss: 0.1493, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 03:33:13] (step=0496000) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0067 +[2025-02-20 03:33:21] (step=0496100) Train Loss: 0.1493, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 03:33:30] (step=0496200) Train Loss: 0.1493, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 03:33:38] (step=0496300) Train Loss: 0.1499, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 03:33:47] (step=0496400) Train Loss: 0.1484, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-20 03:33:57] (step=0496500) Train Loss: 0.1504, Train Steps/Sec: 10.56, Grad Norm: 0.0061 +[2025-02-20 03:34:06] (step=0496600) Train Loss: 0.1484, Train Steps/Sec: 10.56, Grad Norm: 0.0063 +[2025-02-20 03:34:16] (step=0496700) Train Loss: 0.1511, Train Steps/Sec: 9.95, Grad Norm: 0.0061 +[2025-02-20 03:34:25] (step=0496800) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 03:34:34] (step=0496900) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 03:34:42] (step=0497000) Train Loss: 0.1507, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 03:34:51] (step=0497100) Train Loss: 0.1488, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 03:35:00] (step=0497200) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:35:08] (step=0497300) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 03:35:17] (step=0497400) Train Loss: 0.1476, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 03:35:26] (step=0497500) Train Loss: 0.1512, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 03:35:34] (step=0497600) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 03:35:43] (step=0497700) Train Loss: 0.1510, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 03:35:51] (step=0497800) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 03:36:01] (step=0497900) Train Loss: 0.1509, Train Steps/Sec: 9.98, Grad Norm: 0.0061 +[2025-02-20 03:36:10] (step=0498000) Train Loss: 0.1489, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 03:36:19] (step=0498100) Train Loss: 0.1502, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 03:36:27] (step=0498200) Train Loss: 0.1512, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 03:36:36] (step=0498300) Train Loss: 0.1491, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 03:36:45] (step=0498400) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 03:36:53] (step=0498500) Train Loss: 0.1514, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 03:37:03] (step=0498600) Train Loss: 0.1500, Train Steps/Sec: 10.58, Grad Norm: 0.0064 +[2025-02-20 03:37:12] (step=0498700) Train Loss: 0.1509, Train Steps/Sec: 10.88, Grad Norm: 0.0061 +[2025-02-20 03:37:21] (step=0498800) Train Loss: 0.1494, Train Steps/Sec: 10.90, Grad Norm: 0.0062 +[2025-02-20 03:37:30] (step=0498900) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 03:37:38] (step=0499000) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 03:37:47] (step=0499100) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 03:37:57] (step=0499200) Train Loss: 0.1503, Train Steps/Sec: 9.73, Grad Norm: 0.0061 +[2025-02-20 03:38:06] (step=0499300) Train Loss: 0.1487, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 03:38:14] (step=0499400) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 03:38:23] (step=0499500) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 03:38:32] (step=0499600) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 03:38:40] (step=0499700) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 03:38:49] (step=0499800) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 03:38:57] (step=0499900) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 03:39:06] (step=0500000) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 03:39:07] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0500000.pt +[2025-02-20 03:39:16] (step=0500100) Train Loss: 0.1491, Train Steps/Sec: 10.56, Grad Norm: 0.0063 +[2025-02-20 03:39:24] (step=0500200) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 03:39:33] (step=0500300) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 03:39:42] (step=0500400) Train Loss: 0.1508, Train Steps/Sec: 10.86, Grad Norm: 0.0064 +[2025-02-20 03:39:51] (step=0500500) Train Loss: 0.1509, Train Steps/Sec: 10.66, Grad Norm: 0.0069 +[2025-02-20 03:40:00] (step=0500600) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:40:09] (step=0500700) Train Loss: 0.1497, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 03:40:18] (step=0500800) Train Loss: 0.1508, Train Steps/Sec: 10.57, Grad Norm: 0.0061 +[2025-02-20 03:40:27] (step=0500900) Train Loss: 0.1508, Train Steps/Sec: 10.90, Grad Norm: 0.0062 +[2025-02-20 03:40:36] (step=0501000) Train Loss: 0.1488, Train Steps/Sec: 11.25, Grad Norm: 0.0062 +[2025-02-20 03:40:45] (step=0501100) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 03:40:54] (step=0501200) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 03:41:02] (step=0501300) Train Loss: 0.1517, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 03:41:11] (step=0501400) Train Loss: 0.1517, Train Steps/Sec: 11.27, Grad Norm: 0.0062 +[2025-02-20 03:41:20] (step=0501500) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 03:41:28] (step=0501600) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:41:38] (step=0501700) Train Loss: 0.1507, Train Steps/Sec: 9.97, Grad Norm: 0.0063 +[2025-02-20 03:41:47] (step=0501800) Train Loss: 0.1491, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 03:41:56] (step=0501900) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 03:42:04] (step=0502000) Train Loss: 0.1514, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:42:13] (step=0502100) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:42:21] (step=0502200) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 03:42:30] (step=0502300) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 03:42:39] (step=0502400) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:42:47] (step=0502500) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:42:56] (step=0502600) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 03:43:05] (step=0502700) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:43:13] (step=0502800) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:43:22] (step=0502900) Train Loss: 0.1484, Train Steps/Sec: 11.20, Grad Norm: 0.0063 +[2025-02-20 03:43:33] (step=0503000) Train Loss: 0.1495, Train Steps/Sec: 9.23, Grad Norm: 0.0065 +[2025-02-20 03:43:42] (step=0503100) Train Loss: 0.1486, Train Steps/Sec: 10.55, Grad Norm: 0.0060 +[2025-02-20 03:43:51] (step=0503200) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 03:44:00] (step=0503300) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 03:44:08] (step=0503400) Train Loss: 0.1485, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 03:44:17] (step=0503500) Train Loss: 0.1506, Train Steps/Sec: 11.26, Grad Norm: 0.0062 +[2025-02-20 03:44:26] (step=0503600) Train Loss: 0.1515, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 03:44:34] (step=0503700) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 03:44:43] (step=0503800) Train Loss: 0.1489, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 03:44:52] (step=0503900) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:45:00] (step=0504000) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:45:09] (step=0504100) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 03:45:19] (step=0504200) Train Loss: 0.1493, Train Steps/Sec: 9.85, Grad Norm: 0.0068 +[2025-02-20 03:45:28] (step=0504300) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 03:45:36] (step=0504400) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:45:45] (step=0504500) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 03:45:54] (step=0504600) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 03:46:02] (step=0504700) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 03:46:11] (step=0504800) Train Loss: 0.1486, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 03:46:19] (step=0504900) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 03:46:28] (step=0505000) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 03:46:37] (step=0505100) Train Loss: 0.1514, Train Steps/Sec: 11.28, Grad Norm: 0.0063 +[2025-02-20 03:46:47] (step=0505200) Train Loss: 0.1501, Train Steps/Sec: 10.26, Grad Norm: 0.0061 +[2025-02-20 03:46:56] (step=0505300) Train Loss: 0.1499, Train Steps/Sec: 10.92, Grad Norm: 0.0063 +[2025-02-20 03:47:04] (step=0505400) Train Loss: 0.1507, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 03:47:14] (step=0505500) Train Loss: 0.1512, Train Steps/Sec: 10.04, Grad Norm: 0.0061 +[2025-02-20 03:47:23] (step=0505600) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 03:47:32] (step=0505700) Train Loss: 0.1496, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 03:47:40] (step=0505800) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:47:49] (step=0505900) Train Loss: 0.1487, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 03:47:58] (step=0506000) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 03:48:06] (step=0506100) Train Loss: 0.1522, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 03:48:15] (step=0506200) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 03:48:23] (step=0506300) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 03:48:32] (step=0506400) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 03:48:41] (step=0506500) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 03:48:49] (step=0506600) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 03:48:59] (step=0506700) Train Loss: 0.1496, Train Steps/Sec: 9.98, Grad Norm: 0.0062 +[2025-02-20 03:49:08] (step=0506800) Train Loss: 0.1517, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 03:49:17] (step=0506900) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 03:49:25] (step=0507000) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:49:34] (step=0507100) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 03:49:43] (step=0507200) Train Loss: 0.1505, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-20 03:49:52] (step=0507300) Train Loss: 0.1510, Train Steps/Sec: 10.90, Grad Norm: 0.0059 +[2025-02-20 03:50:01] (step=0507400) Train Loss: 0.1508, Train Steps/Sec: 10.88, Grad Norm: 0.0061 +[2025-02-20 03:50:10] (step=0507500) Train Loss: 0.1506, Train Steps/Sec: 10.92, Grad Norm: 0.0063 +[2025-02-20 03:50:19] (step=0507600) Train Loss: 0.1486, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 03:50:27] (step=0507700) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 03:50:36] (step=0507800) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 03:50:45] (step=0507900) Train Loss: 0.1500, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-20 03:50:55] (step=0508000) Train Loss: 0.1504, Train Steps/Sec: 10.05, Grad Norm: 0.0060 +[2025-02-20 03:51:03] (step=0508100) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 03:51:12] (step=0508200) Train Loss: 0.1484, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 03:51:21] (step=0508300) Train Loss: 0.1515, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 03:51:29] (step=0508400) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 03:51:38] (step=0508500) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:51:47] (step=0508600) Train Loss: 0.1512, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:51:55] (step=0508700) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 03:52:04] (step=0508800) Train Loss: 0.1487, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 03:52:12] (step=0508900) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 03:52:21] (step=0509000) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0067 +[2025-02-20 03:52:30] (step=0509100) Train Loss: 0.1494, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 03:52:40] (step=0509200) Train Loss: 0.1511, Train Steps/Sec: 9.97, Grad Norm: 0.0062 +[2025-02-20 03:52:48] (step=0509300) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:52:57] (step=0509400) Train Loss: 0.1488, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 03:53:07] (step=0509500) Train Loss: 0.1509, Train Steps/Sec: 10.57, Grad Norm: 0.0062 +[2025-02-20 03:53:16] (step=0509600) Train Loss: 0.1509, Train Steps/Sec: 10.91, Grad Norm: 0.0063 +[2025-02-20 03:53:25] (step=0509700) Train Loss: 0.1499, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-20 03:53:33] (step=0509800) Train Loss: 0.1517, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 03:53:42] (step=0509900) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 03:53:51] (step=0510000) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 03:53:59] (step=0510100) Train Loss: 0.1499, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 03:54:08] (step=0510200) Train Loss: 0.1497, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 03:54:17] (step=0510300) Train Loss: 0.1484, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 03:54:25] (step=0510400) Train Loss: 0.1499, Train Steps/Sec: 11.54, Grad Norm: 0.0061 +[2025-02-20 03:54:35] (step=0510500) Train Loss: 0.1513, Train Steps/Sec: 10.02, Grad Norm: 0.0062 +[2025-02-20 03:54:44] (step=0510600) Train Loss: 0.1501, Train Steps/Sec: 11.57, Grad Norm: 0.0067 +[2025-02-20 03:54:53] (step=0510700) Train Loss: 0.1508, Train Steps/Sec: 11.57, Grad Norm: 0.0065 +[2025-02-20 03:55:01] (step=0510800) Train Loss: 0.1502, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 03:55:10] (step=0510900) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 03:55:19] (step=0511000) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 03:55:27] (step=0511100) Train Loss: 0.1506, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 03:55:36] (step=0511200) Train Loss: 0.1498, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 03:55:44] (step=0511300) Train Loss: 0.1498, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 03:55:53] (step=0511400) Train Loss: 0.1504, Train Steps/Sec: 11.58, Grad Norm: 0.0065 +[2025-02-20 03:56:02] (step=0511500) Train Loss: 0.1503, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 03:56:11] (step=0511600) Train Loss: 0.1501, Train Steps/Sec: 11.24, Grad Norm: 0.0065 +[2025-02-20 03:56:22] (step=0511700) Train Loss: 0.1511, Train Steps/Sec: 8.99, Grad Norm: 0.0063 +[2025-02-20 03:56:31] (step=0511800) Train Loss: 0.1492, Train Steps/Sec: 10.88, Grad Norm: 0.0063 +[2025-02-20 03:56:40] (step=0511900) Train Loss: 0.1496, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 03:56:48] (step=0512000) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 03:56:57] (step=0512100) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:57:06] (step=0512200) Train Loss: 0.1519, Train Steps/Sec: 11.27, Grad Norm: 0.0063 +[2025-02-20 03:57:14] (step=0512300) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 03:57:23] (step=0512400) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 03:57:32] (step=0512500) Train Loss: 0.1487, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 03:57:40] (step=0512600) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 03:57:49] (step=0512700) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 03:57:57] (step=0512800) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 03:58:06] (step=0512900) Train Loss: 0.1514, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 03:58:16] (step=0513000) Train Loss: 0.1496, Train Steps/Sec: 9.94, Grad Norm: 0.0063 +[2025-02-20 03:58:25] (step=0513100) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 03:58:33] (step=0513200) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 03:58:42] (step=0513300) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 03:58:51] (step=0513400) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 03:58:59] (step=0513500) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 03:59:08] (step=0513600) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 03:59:17] (step=0513700) Train Loss: 0.1497, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 03:59:26] (step=0513800) Train Loss: 0.1503, Train Steps/Sec: 11.26, Grad Norm: 0.0063 +[2025-02-20 03:59:35] (step=0513900) Train Loss: 0.1509, Train Steps/Sec: 10.53, Grad Norm: 0.0062 +[2025-02-20 03:59:44] (step=0514000) Train Loss: 0.1511, Train Steps/Sec: 10.90, Grad Norm: 0.0065 +[2025-02-20 03:59:53] (step=0514100) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 04:00:03] (step=0514200) Train Loss: 0.1503, Train Steps/Sec: 9.97, Grad Norm: 0.0066 +[2025-02-20 04:00:12] (step=0514300) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 04:00:20] (step=0514400) Train Loss: 0.1500, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-20 04:00:29] (step=0514500) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 04:00:38] (step=0514600) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 04:00:46] (step=0514700) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 04:00:55] (step=0514800) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 04:01:03] (step=0514900) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 04:01:12] (step=0515000) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 04:01:21] (step=0515100) Train Loss: 0.1515, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:01:29] (step=0515200) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 04:01:38] (step=0515300) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:01:47] (step=0515400) Train Loss: 0.1494, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 04:01:57] (step=0515500) Train Loss: 0.1508, Train Steps/Sec: 10.01, Grad Norm: 0.0060 +[2025-02-20 04:02:05] (step=0515600) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:02:14] (step=0515700) Train Loss: 0.1513, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 04:02:22] (step=0515800) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:02:31] (step=0515900) Train Loss: 0.1514, Train Steps/Sec: 11.24, Grad Norm: 0.0059 +[2025-02-20 04:02:41] (step=0516000) Train Loss: 0.1502, Train Steps/Sec: 10.90, Grad Norm: 0.0063 +[2025-02-20 04:02:50] (step=0516100) Train Loss: 0.1505, Train Steps/Sec: 10.56, Grad Norm: 0.0059 +[2025-02-20 04:02:59] (step=0516200) Train Loss: 0.1501, Train Steps/Sec: 11.19, Grad Norm: 0.0064 +[2025-02-20 04:03:08] (step=0516300) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 04:03:16] (step=0516400) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 04:03:25] (step=0516500) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 04:03:34] (step=0516600) Train Loss: 0.1513, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 04:03:44] (step=0516700) Train Loss: 0.1509, Train Steps/Sec: 9.97, Grad Norm: 0.0061 +[2025-02-20 04:03:52] (step=0516800) Train Loss: 0.1507, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 04:04:01] (step=0516900) Train Loss: 0.1504, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 04:04:10] (step=0517000) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 04:04:18] (step=0517100) Train Loss: 0.1490, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:04:27] (step=0517200) Train Loss: 0.1507, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 04:04:36] (step=0517300) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 04:04:44] (step=0517400) Train Loss: 0.1490, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 04:04:53] (step=0517500) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 04:05:01] (step=0517600) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:05:10] (step=0517700) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:05:19] (step=0517800) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 04:05:27] (step=0517900) Train Loss: 0.1496, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 04:05:37] (step=0518000) Train Loss: 0.1508, Train Steps/Sec: 10.04, Grad Norm: 0.0064 +[2025-02-20 04:05:46] (step=0518100) Train Loss: 0.1503, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-20 04:05:56] (step=0518200) Train Loss: 0.1500, Train Steps/Sec: 10.56, Grad Norm: 0.0060 +[2025-02-20 04:06:05] (step=0518300) Train Loss: 0.1495, Train Steps/Sec: 10.55, Grad Norm: 0.0061 +[2025-02-20 04:06:14] (step=0518400) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 04:06:22] (step=0518500) Train Loss: 0.1488, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 04:06:31] (step=0518600) Train Loss: 0.1481, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 04:06:40] (step=0518700) Train Loss: 0.1498, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 04:06:48] (step=0518800) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:06:57] (step=0518900) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:07:06] (step=0519000) Train Loss: 0.1517, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 04:07:14] (step=0519100) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 04:07:24] (step=0519200) Train Loss: 0.1510, Train Steps/Sec: 9.95, Grad Norm: 0.0061 +[2025-02-20 04:07:33] (step=0519300) Train Loss: 0.1490, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 04:07:42] (step=0519400) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 04:07:50] (step=0519500) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 04:07:59] (step=0519600) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:08:08] (step=0519700) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 04:08:16] (step=0519800) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 04:08:25] (step=0519900) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 04:08:33] (step=0520000) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:08:35] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0520000.pt +[2025-02-20 04:08:43] (step=0520100) Train Loss: 0.1511, Train Steps/Sec: 10.20, Grad Norm: 0.0063 +[2025-02-20 04:08:52] (step=0520200) Train Loss: 0.1496, Train Steps/Sec: 11.24, Grad Norm: 0.0063 +[2025-02-20 04:09:01] (step=0520300) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 04:09:10] (step=0520400) Train Loss: 0.1502, Train Steps/Sec: 10.53, Grad Norm: 0.0060 +[2025-02-20 04:09:21] (step=0520500) Train Loss: 0.1495, Train Steps/Sec: 9.25, Grad Norm: 0.0063 +[2025-02-20 04:09:30] (step=0520600) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 04:09:38] (step=0520700) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 04:09:47] (step=0520800) Train Loss: 0.1517, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 04:09:56] (step=0520900) Train Loss: 0.1501, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 04:10:04] (step=0521000) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:10:13] (step=0521100) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 04:10:22] (step=0521200) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 04:10:30] (step=0521300) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 04:10:39] (step=0521400) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 04:10:47] (step=0521500) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-20 04:10:56] (step=0521600) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 04:11:06] (step=0521700) Train Loss: 0.1508, Train Steps/Sec: 9.98, Grad Norm: 0.0062 +[2025-02-20 04:11:15] (step=0521800) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 04:11:23] (step=0521900) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 04:11:32] (step=0522000) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 04:11:40] (step=0522100) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 04:11:49] (step=0522200) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 04:11:58] (step=0522300) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:12:07] (step=0522400) Train Loss: 0.1504, Train Steps/Sec: 11.27, Grad Norm: 0.0063 +[2025-02-20 04:12:15] (step=0522500) Train Loss: 0.1498, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-20 04:12:25] (step=0522600) Train Loss: 0.1495, Train Steps/Sec: 10.54, Grad Norm: 0.0060 +[2025-02-20 04:12:34] (step=0522700) Train Loss: 0.1500, Train Steps/Sec: 10.91, Grad Norm: 0.0063 +[2025-02-20 04:12:43] (step=0522800) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 04:12:51] (step=0522900) Train Loss: 0.1497, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 04:13:01] (step=0523000) Train Loss: 0.1510, Train Steps/Sec: 10.06, Grad Norm: 0.0065 +[2025-02-20 04:13:10] (step=0523100) Train Loss: 0.1508, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 04:13:19] (step=0523200) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 04:13:27] (step=0523300) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 04:13:36] (step=0523400) Train Loss: 0.1500, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 04:13:45] (step=0523500) Train Loss: 0.1515, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 04:13:53] (step=0523600) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 04:14:02] (step=0523700) Train Loss: 0.1518, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:14:11] (step=0523800) Train Loss: 0.1475, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 04:14:19] (step=0523900) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 04:14:28] (step=0524000) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 04:14:36] (step=0524100) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 04:14:46] (step=0524200) Train Loss: 0.1512, Train Steps/Sec: 9.95, Grad Norm: 0.0064 +[2025-02-20 04:14:55] (step=0524300) Train Loss: 0.1520, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 04:15:04] (step=0524400) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 04:15:13] (step=0524500) Train Loss: 0.1495, Train Steps/Sec: 11.25, Grad Norm: 0.0062 +[2025-02-20 04:15:21] (step=0524600) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:15:30] (step=0524700) Train Loss: 0.1506, Train Steps/Sec: 10.91, Grad Norm: 0.0061 +[2025-02-20 04:15:40] (step=0524800) Train Loss: 0.1512, Train Steps/Sec: 10.57, Grad Norm: 0.0061 +[2025-02-20 04:15:49] (step=0524900) Train Loss: 0.1503, Train Steps/Sec: 11.22, Grad Norm: 0.0062 +[2025-02-20 04:15:57] (step=0525000) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:16:06] (step=0525100) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 04:16:15] (step=0525200) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0068 +[2025-02-20 04:16:23] (step=0525300) Train Loss: 0.1503, Train Steps/Sec: 11.26, Grad Norm: 0.0062 +[2025-02-20 04:16:32] (step=0525400) Train Loss: 0.1498, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 04:16:42] (step=0525500) Train Loss: 0.1503, Train Steps/Sec: 10.04, Grad Norm: 0.0061 +[2025-02-20 04:16:51] (step=0525600) Train Loss: 0.1492, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 04:16:59] (step=0525700) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 04:17:08] (step=0525800) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 04:17:17] (step=0525900) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 04:17:25] (step=0526000) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 04:17:34] (step=0526100) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:17:42] (step=0526200) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 04:17:51] (step=0526300) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 04:18:00] (step=0526400) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 04:18:08] (step=0526500) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 04:18:17] (step=0526600) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 04:18:27] (step=0526700) Train Loss: 0.1503, Train Steps/Sec: 9.77, Grad Norm: 0.0063 +[2025-02-20 04:18:36] (step=0526800) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 04:18:45] (step=0526900) Train Loss: 0.1488, Train Steps/Sec: 10.58, Grad Norm: 0.0062 +[2025-02-20 04:18:55] (step=0527000) Train Loss: 0.1498, Train Steps/Sec: 10.58, Grad Norm: 0.0063 +[2025-02-20 04:19:03] (step=0527100) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 04:19:12] (step=0527200) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 04:19:20] (step=0527300) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 04:19:29] (step=0527400) Train Loss: 0.1507, Train Steps/Sec: 11.29, Grad Norm: 0.0060 +[2025-02-20 04:19:38] (step=0527500) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 04:19:47] (step=0527600) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 04:19:55] (step=0527700) Train Loss: 0.1511, Train Steps/Sec: 11.43, Grad Norm: 0.0062 +[2025-02-20 04:20:04] (step=0527800) Train Loss: 0.1503, Train Steps/Sec: 11.44, Grad Norm: 0.0062 +[2025-02-20 04:20:13] (step=0527900) Train Loss: 0.1494, Train Steps/Sec: 11.39, Grad Norm: 0.0061 +[2025-02-20 04:20:23] (step=0528000) Train Loss: 0.1503, Train Steps/Sec: 9.98, Grad Norm: 0.0062 +[2025-02-20 04:20:31] (step=0528100) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 04:20:40] (step=0528200) Train Loss: 0.1509, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 04:20:49] (step=0528300) Train Loss: 0.1501, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 04:20:57] (step=0528400) Train Loss: 0.1501, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 04:21:06] (step=0528500) Train Loss: 0.1497, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 04:21:15] (step=0528600) Train Loss: 0.1497, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 04:21:23] (step=0528700) Train Loss: 0.1509, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 04:21:32] (step=0528800) Train Loss: 0.1505, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 04:21:41] (step=0528900) Train Loss: 0.1495, Train Steps/Sec: 11.21, Grad Norm: 0.0062 +[2025-02-20 04:21:49] (step=0529000) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 04:21:59] (step=0529100) Train Loss: 0.1501, Train Steps/Sec: 10.57, Grad Norm: 0.0060 +[2025-02-20 04:22:10] (step=0529200) Train Loss: 0.1504, Train Steps/Sec: 8.97, Grad Norm: 0.0060 +[2025-02-20 04:22:19] (step=0529300) Train Loss: 0.1515, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 04:22:27] (step=0529400) Train Loss: 0.1491, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 04:22:36] (step=0529500) Train Loss: 0.1498, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 04:22:45] (step=0529600) Train Loss: 0.1502, Train Steps/Sec: 11.21, Grad Norm: 0.0063 +[2025-02-20 04:22:54] (step=0529700) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 04:23:02] (step=0529800) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 04:23:11] (step=0529900) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:23:19] (step=0530000) Train Loss: 0.1491, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 04:23:28] (step=0530100) Train Loss: 0.1492, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 04:23:37] (step=0530200) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 04:23:45] (step=0530300) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 04:23:54] (step=0530400) Train Loss: 0.1500, Train Steps/Sec: 11.56, Grad Norm: 0.0064 +[2025-02-20 04:24:04] (step=0530500) Train Loss: 0.1509, Train Steps/Sec: 9.81, Grad Norm: 0.0062 +[2025-02-20 04:24:13] (step=0530600) Train Loss: 0.1493, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 04:24:21] (step=0530700) Train Loss: 0.1497, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 04:24:30] (step=0530800) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:24:39] (step=0530900) Train Loss: 0.1507, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 04:24:47] (step=0531000) Train Loss: 0.1509, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:24:56] (step=0531100) Train Loss: 0.1491, Train Steps/Sec: 11.26, Grad Norm: 0.0059 +[2025-02-20 04:25:05] (step=0531200) Train Loss: 0.1515, Train Steps/Sec: 11.27, Grad Norm: 0.0062 +[2025-02-20 04:25:14] (step=0531300) Train Loss: 0.1515, Train Steps/Sec: 10.56, Grad Norm: 0.0062 +[2025-02-20 04:25:24] (step=0531400) Train Loss: 0.1505, Train Steps/Sec: 10.90, Grad Norm: 0.0063 +[2025-02-20 04:25:32] (step=0531500) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 04:25:41] (step=0531600) Train Loss: 0.1500, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 04:25:51] (step=0531700) Train Loss: 0.1499, Train Steps/Sec: 9.77, Grad Norm: 0.0059 +[2025-02-20 04:26:00] (step=0531800) Train Loss: 0.1512, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 04:26:09] (step=0531900) Train Loss: 0.1514, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 04:26:17] (step=0532000) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 04:26:26] (step=0532100) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 04:26:34] (step=0532200) Train Loss: 0.1514, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 04:26:43] (step=0532300) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 04:26:52] (step=0532400) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 04:27:00] (step=0532500) Train Loss: 0.1514, Train Steps/Sec: 11.60, Grad Norm: 0.0058 +[2025-02-20 04:27:09] (step=0532600) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 04:27:18] (step=0532700) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 04:27:26] (step=0532800) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 04:27:35] (step=0532900) Train Loss: 0.1501, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 04:27:45] (step=0533000) Train Loss: 0.1495, Train Steps/Sec: 9.92, Grad Norm: 0.0061 +[2025-02-20 04:27:54] (step=0533100) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 04:28:02] (step=0533200) Train Loss: 0.1501, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 04:28:11] (step=0533300) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:28:20] (step=0533400) Train Loss: 0.1506, Train Steps/Sec: 10.61, Grad Norm: 0.0061 +[2025-02-20 04:28:30] (step=0533500) Train Loss: 0.1493, Train Steps/Sec: 10.60, Grad Norm: 0.0061 +[2025-02-20 04:28:39] (step=0533600) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 04:28:47] (step=0533700) Train Loss: 0.1515, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 04:28:56] (step=0533800) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:29:04] (step=0533900) Train Loss: 0.1516, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 04:29:13] (step=0534000) Train Loss: 0.1498, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-20 04:29:22] (step=0534100) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 04:29:32] (step=0534200) Train Loss: 0.1501, Train Steps/Sec: 9.98, Grad Norm: 0.0062 +[2025-02-20 04:29:41] (step=0534300) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 04:29:49] (step=0534400) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 04:29:58] (step=0534500) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 04:30:06] (step=0534600) Train Loss: 0.1514, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 04:30:15] (step=0534700) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 04:30:24] (step=0534800) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 04:30:32] (step=0534900) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 04:30:41] (step=0535000) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 04:30:49] (step=0535100) Train Loss: 0.1514, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 04:30:58] (step=0535200) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 04:31:07] (step=0535300) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 04:31:16] (step=0535400) Train Loss: 0.1516, Train Steps/Sec: 11.22, Grad Norm: 0.0061 +[2025-02-20 04:31:26] (step=0535500) Train Loss: 0.1517, Train Steps/Sec: 10.03, Grad Norm: 0.0064 +[2025-02-20 04:31:35] (step=0535600) Train Loss: 0.1501, Train Steps/Sec: 10.58, Grad Norm: 0.0062 +[2025-02-20 04:31:44] (step=0535700) Train Loss: 0.1504, Train Steps/Sec: 10.56, Grad Norm: 0.0062 +[2025-02-20 04:31:53] (step=0535800) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:32:02] (step=0535900) Train Loss: 0.1502, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 04:32:10] (step=0536000) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 04:32:19] (step=0536100) Train Loss: 0.1508, Train Steps/Sec: 11.21, Grad Norm: 0.0061 +[2025-02-20 04:32:28] (step=0536200) Train Loss: 0.1496, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 04:32:37] (step=0536300) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 04:32:45] (step=0536400) Train Loss: 0.1503, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 04:32:54] (step=0536500) Train Loss: 0.1499, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 04:33:02] (step=0536600) Train Loss: 0.1497, Train Steps/Sec: 11.58, Grad Norm: 0.0065 +[2025-02-20 04:33:13] (step=0536700) Train Loss: 0.1507, Train Steps/Sec: 9.92, Grad Norm: 0.0060 +[2025-02-20 04:33:21] (step=0536800) Train Loss: 0.1501, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 04:33:30] (step=0536900) Train Loss: 0.1503, Train Steps/Sec: 11.57, Grad Norm: 0.0064 +[2025-02-20 04:33:38] (step=0537000) Train Loss: 0.1501, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 04:33:47] (step=0537100) Train Loss: 0.1502, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 04:33:56] (step=0537200) Train Loss: 0.1518, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 04:34:04] (step=0537300) Train Loss: 0.1497, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 04:34:13] (step=0537400) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 04:34:22] (step=0537500) Train Loss: 0.1491, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 04:34:31] (step=0537600) Train Loss: 0.1502, Train Steps/Sec: 11.26, Grad Norm: 0.0059 +[2025-02-20 04:34:39] (step=0537700) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 04:34:49] (step=0537800) Train Loss: 0.1500, Train Steps/Sec: 10.60, Grad Norm: 0.0060 +[2025-02-20 04:34:58] (step=0537900) Train Loss: 0.1510, Train Steps/Sec: 10.52, Grad Norm: 0.0063 +[2025-02-20 04:35:08] (step=0538000) Train Loss: 0.1511, Train Steps/Sec: 10.01, Grad Norm: 0.0067 +[2025-02-20 04:35:17] (step=0538100) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:35:25] (step=0538200) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 04:35:34] (step=0538300) Train Loss: 0.1513, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-20 04:35:43] (step=0538400) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 04:35:51] (step=0538500) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:36:00] (step=0538600) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:36:09] (step=0538700) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 04:36:17] (step=0538800) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 04:36:26] (step=0538900) Train Loss: 0.1486, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:36:35] (step=0539000) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:36:43] (step=0539100) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 04:36:53] (step=0539200) Train Loss: 0.1500, Train Steps/Sec: 9.98, Grad Norm: 0.0064 +[2025-02-20 04:37:02] (step=0539300) Train Loss: 0.1498, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 04:37:11] (step=0539400) Train Loss: 0.1505, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 04:37:19] (step=0539500) Train Loss: 0.1506, Train Steps/Sec: 11.55, Grad Norm: 0.0063 +[2025-02-20 04:37:28] (step=0539600) Train Loss: 0.1508, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 04:37:37] (step=0539700) Train Loss: 0.1513, Train Steps/Sec: 11.22, Grad Norm: 0.0063 +[2025-02-20 04:37:45] (step=0539800) Train Loss: 0.1506, Train Steps/Sec: 11.55, Grad Norm: 0.0062 +[2025-02-20 04:37:54] (step=0539900) Train Loss: 0.1504, Train Steps/Sec: 11.23, Grad Norm: 0.0064 +[2025-02-20 04:38:04] (step=0540000) Train Loss: 0.1486, Train Steps/Sec: 10.55, Grad Norm: 0.0060 +[2025-02-20 04:38:05] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0540000.pt +[2025-02-20 04:38:14] (step=0540100) Train Loss: 0.1519, Train Steps/Sec: 9.80, Grad Norm: 0.0061 +[2025-02-20 04:38:23] (step=0540200) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 04:38:31] (step=0540300) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 04:38:40] (step=0540400) Train Loss: 0.1494, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 04:38:50] (step=0540500) Train Loss: 0.1495, Train Steps/Sec: 9.69, Grad Norm: 0.0062 +[2025-02-20 04:38:59] (step=0540600) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 04:39:07] (step=0540700) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 04:39:16] (step=0540800) Train Loss: 0.1504, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 04:39:25] (step=0540900) Train Loss: 0.1490, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 04:39:33] (step=0541000) Train Loss: 0.1502, Train Steps/Sec: 11.57, Grad Norm: 0.0064 +[2025-02-20 04:39:42] (step=0541100) Train Loss: 0.1504, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 04:39:51] (step=0541200) Train Loss: 0.1494, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 04:39:59] (step=0541300) Train Loss: 0.1493, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 04:40:08] (step=0541400) Train Loss: 0.1489, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 04:40:17] (step=0541500) Train Loss: 0.1500, Train Steps/Sec: 11.57, Grad Norm: 0.0064 +[2025-02-20 04:40:25] (step=0541600) Train Loss: 0.1505, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 04:40:35] (step=0541700) Train Loss: 0.1506, Train Steps/Sec: 9.89, Grad Norm: 0.0061 +[2025-02-20 04:40:44] (step=0541800) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 04:40:53] (step=0541900) Train Loss: 0.1502, Train Steps/Sec: 11.23, Grad Norm: 0.0063 +[2025-02-20 04:41:01] (step=0542000) Train Loss: 0.1511, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 04:41:11] (step=0542100) Train Loss: 0.1521, Train Steps/Sec: 10.58, Grad Norm: 0.0061 +[2025-02-20 04:41:20] (step=0542200) Train Loss: 0.1496, Train Steps/Sec: 10.57, Grad Norm: 0.0061 +[2025-02-20 04:41:29] (step=0542300) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 04:41:38] (step=0542400) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:41:46] (step=0542500) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:41:55] (step=0542600) Train Loss: 0.1505, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 04:42:04] (step=0542700) Train Loss: 0.1508, Train Steps/Sec: 11.20, Grad Norm: 0.0060 +[2025-02-20 04:42:12] (step=0542800) Train Loss: 0.1513, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 04:42:21] (step=0542900) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 04:42:31] (step=0543000) Train Loss: 0.1492, Train Steps/Sec: 9.94, Grad Norm: 0.0060 +[2025-02-20 04:42:40] (step=0543100) Train Loss: 0.1493, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:42:48] (step=0543200) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:42:57] (step=0543300) Train Loss: 0.1509, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 04:43:06] (step=0543400) Train Loss: 0.1510, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 04:43:14] (step=0543500) Train Loss: 0.1480, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 04:43:23] (step=0543600) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 04:43:32] (step=0543700) Train Loss: 0.1504, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 04:43:40] (step=0543800) Train Loss: 0.1499, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 04:43:49] (step=0543900) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 04:43:57] (step=0544000) Train Loss: 0.1492, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:44:06] (step=0544100) Train Loss: 0.1499, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-20 04:44:16] (step=0544200) Train Loss: 0.1502, Train Steps/Sec: 10.00, Grad Norm: 0.0062 +[2025-02-20 04:44:26] (step=0544300) Train Loss: 0.1490, Train Steps/Sec: 10.55, Grad Norm: 0.0061 +[2025-02-20 04:44:35] (step=0544400) Train Loss: 0.1507, Train Steps/Sec: 10.55, Grad Norm: 0.0062 +[2025-02-20 04:44:44] (step=0544500) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 04:44:53] (step=0544600) Train Loss: 0.1488, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 04:45:01] (step=0544700) Train Loss: 0.1515, Train Steps/Sec: 11.59, Grad Norm: 0.0065 +[2025-02-20 04:45:10] (step=0544800) Train Loss: 0.1495, Train Steps/Sec: 11.21, Grad Norm: 0.0061 +[2025-02-20 04:45:19] (step=0544900) Train Loss: 0.1488, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 04:45:27] (step=0545000) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 04:45:36] (step=0545100) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 04:45:45] (step=0545200) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:45:53] (step=0545300) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:46:02] (step=0545400) Train Loss: 0.1498, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 04:46:12] (step=0545500) Train Loss: 0.1507, Train Steps/Sec: 9.92, Grad Norm: 0.0062 +[2025-02-20 04:46:21] (step=0545600) Train Loss: 0.1487, Train Steps/Sec: 11.57, Grad Norm: 0.0068 +[2025-02-20 04:46:29] (step=0545700) Train Loss: 0.1505, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 04:46:38] (step=0545800) Train Loss: 0.1501, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 04:46:47] (step=0545900) Train Loss: 0.1503, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 04:46:55] (step=0546000) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 04:47:04] (step=0546100) Train Loss: 0.1500, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 04:47:13] (step=0546200) Train Loss: 0.1494, Train Steps/Sec: 11.25, Grad Norm: 0.0065 +[2025-02-20 04:47:21] (step=0546300) Train Loss: 0.1511, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 04:47:30] (step=0546400) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 04:47:40] (step=0546500) Train Loss: 0.1506, Train Steps/Sec: 10.27, Grad Norm: 0.0063 +[2025-02-20 04:47:49] (step=0546600) Train Loss: 0.1497, Train Steps/Sec: 10.89, Grad Norm: 0.0060 +[2025-02-20 04:47:59] (step=0546700) Train Loss: 0.1491, Train Steps/Sec: 10.01, Grad Norm: 0.0059 +[2025-02-20 04:48:08] (step=0546800) Train Loss: 0.1499, Train Steps/Sec: 11.55, Grad Norm: 0.0060 +[2025-02-20 04:48:16] (step=0546900) Train Loss: 0.1494, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 04:48:25] (step=0547000) Train Loss: 0.1504, Train Steps/Sec: 11.19, Grad Norm: 0.0061 +[2025-02-20 04:48:34] (step=0547100) Train Loss: 0.1501, Train Steps/Sec: 11.57, Grad Norm: 0.0064 +[2025-02-20 04:48:42] (step=0547200) Train Loss: 0.1501, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 04:48:51] (step=0547300) Train Loss: 0.1507, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 04:49:00] (step=0547400) Train Loss: 0.1506, Train Steps/Sec: 11.55, Grad Norm: 0.0062 +[2025-02-20 04:49:08] (step=0547500) Train Loss: 0.1511, Train Steps/Sec: 11.55, Grad Norm: 0.0064 +[2025-02-20 04:49:17] (step=0547600) Train Loss: 0.1490, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 04:49:26] (step=0547700) Train Loss: 0.1506, Train Steps/Sec: 11.55, Grad Norm: 0.0060 +[2025-02-20 04:49:34] (step=0547800) Train Loss: 0.1508, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 04:49:43] (step=0547900) Train Loss: 0.1505, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 04:49:53] (step=0548000) Train Loss: 0.1505, Train Steps/Sec: 9.95, Grad Norm: 0.0062 +[2025-02-20 04:50:02] (step=0548100) Train Loss: 0.1488, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 04:50:10] (step=0548200) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0058 +[2025-02-20 04:50:19] (step=0548300) Train Loss: 0.1517, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 04:50:28] (step=0548400) Train Loss: 0.1497, Train Steps/Sec: 11.24, Grad Norm: 0.0065 +[2025-02-20 04:50:36] (step=0548500) Train Loss: 0.1498, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 04:50:46] (step=0548600) Train Loss: 0.1493, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-20 04:50:55] (step=0548700) Train Loss: 0.1494, Train Steps/Sec: 10.85, Grad Norm: 0.0062 +[2025-02-20 04:51:04] (step=0548800) Train Loss: 0.1494, Train Steps/Sec: 10.89, Grad Norm: 0.0061 +[2025-02-20 04:51:13] (step=0548900) Train Loss: 0.1482, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 04:51:21] (step=0549000) Train Loss: 0.1504, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 04:51:30] (step=0549100) Train Loss: 0.1501, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 04:51:40] (step=0549200) Train Loss: 0.1501, Train Steps/Sec: 9.66, Grad Norm: 0.0060 +[2025-02-20 04:51:49] (step=0549300) Train Loss: 0.1504, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 04:51:58] (step=0549400) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 04:52:06] (step=0549500) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 04:52:15] (step=0549600) Train Loss: 0.1494, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 04:52:23] (step=0549700) Train Loss: 0.1496, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 04:52:32] (step=0549800) Train Loss: 0.1507, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 04:52:41] (step=0549900) Train Loss: 0.1506, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 04:52:49] (step=0550000) Train Loss: 0.1498, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 04:52:58] (step=0550100) Train Loss: 0.1514, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 04:53:07] (step=0550200) Train Loss: 0.1495, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 04:53:15] (step=0550300) Train Loss: 0.1511, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 04:53:24] (step=0550400) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 04:53:34] (step=0550500) Train Loss: 0.1498, Train Steps/Sec: 9.91, Grad Norm: 0.0060 +[2025-02-20 04:53:43] (step=0550600) Train Loss: 0.1500, Train Steps/Sec: 11.22, Grad Norm: 0.0060 +[2025-02-20 04:53:52] (step=0550700) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 04:54:01] (step=0550800) Train Loss: 0.1497, Train Steps/Sec: 10.58, Grad Norm: 0.0060 +[2025-02-20 04:54:11] (step=0550900) Train Loss: 0.1506, Train Steps/Sec: 10.56, Grad Norm: 0.0062 +[2025-02-20 04:54:19] (step=0551000) Train Loss: 0.1498, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 04:54:28] (step=0551100) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 04:54:36] (step=0551200) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 04:54:45] (step=0551300) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 04:54:54] (step=0551400) Train Loss: 0.1506, Train Steps/Sec: 11.20, Grad Norm: 0.0062 +[2025-02-20 04:55:03] (step=0551500) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 04:55:11] (step=0551600) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 04:55:21] (step=0551700) Train Loss: 0.1502, Train Steps/Sec: 9.97, Grad Norm: 0.0060 +[2025-02-20 04:55:30] (step=0551800) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 04:55:38] (step=0551900) Train Loss: 0.1487, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 04:55:47] (step=0552000) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:55:56] (step=0552100) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 04:56:04] (step=0552200) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:56:13] (step=0552300) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 04:56:22] (step=0552400) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:56:30] (step=0552500) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 04:56:39] (step=0552600) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 04:56:48] (step=0552700) Train Loss: 0.1502, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 04:56:56] (step=0552800) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 04:57:05] (step=0552900) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:57:16] (step=0553000) Train Loss: 0.1498, Train Steps/Sec: 8.95, Grad Norm: 0.0066 +[2025-02-20 04:57:25] (step=0553100) Train Loss: 0.1482, Train Steps/Sec: 10.88, Grad Norm: 0.0060 +[2025-02-20 04:57:34] (step=0553200) Train Loss: 0.1504, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 04:57:43] (step=0553300) Train Loss: 0.1507, Train Steps/Sec: 11.55, Grad Norm: 0.0063 +[2025-02-20 04:57:51] (step=0553400) Train Loss: 0.1508, Train Steps/Sec: 11.54, Grad Norm: 0.0061 +[2025-02-20 04:58:00] (step=0553500) Train Loss: 0.1488, Train Steps/Sec: 11.54, Grad Norm: 0.0063 +[2025-02-20 04:58:09] (step=0553600) Train Loss: 0.1497, Train Steps/Sec: 11.15, Grad Norm: 0.0060 +[2025-02-20 04:58:18] (step=0553700) Train Loss: 0.1501, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 04:58:26] (step=0553800) Train Loss: 0.1492, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 04:58:35] (step=0553900) Train Loss: 0.1506, Train Steps/Sec: 11.56, Grad Norm: 0.0064 +[2025-02-20 04:58:44] (step=0554000) Train Loss: 0.1513, Train Steps/Sec: 11.55, Grad Norm: 0.0062 +[2025-02-20 04:58:52] (step=0554100) Train Loss: 0.1511, Train Steps/Sec: 11.55, Grad Norm: 0.0063 +[2025-02-20 04:59:02] (step=0554200) Train Loss: 0.1488, Train Steps/Sec: 9.86, Grad Norm: 0.0064 +[2025-02-20 04:59:11] (step=0554300) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 04:59:20] (step=0554400) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 04:59:28] (step=0554500) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 04:59:37] (step=0554600) Train Loss: 0.1514, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 04:59:45] (step=0554700) Train Loss: 0.1504, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 04:59:54] (step=0554800) Train Loss: 0.1494, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:00:03] (step=0554900) Train Loss: 0.1495, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-20 05:00:12] (step=0555000) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:00:21] (step=0555100) Train Loss: 0.1500, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-20 05:00:30] (step=0555200) Train Loss: 0.1512, Train Steps/Sec: 10.55, Grad Norm: 0.0062 +[2025-02-20 05:00:39] (step=0555300) Train Loss: 0.1499, Train Steps/Sec: 10.90, Grad Norm: 0.0061 +[2025-02-20 05:00:48] (step=0555400) Train Loss: 0.1519, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:00:58] (step=0555500) Train Loss: 0.1506, Train Steps/Sec: 10.06, Grad Norm: 0.0061 +[2025-02-20 05:01:06] (step=0555600) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:01:15] (step=0555700) Train Loss: 0.1488, Train Steps/Sec: 11.21, Grad Norm: 0.0060 +[2025-02-20 05:01:24] (step=0555800) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:01:33] (step=0555900) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:01:41] (step=0556000) Train Loss: 0.1515, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:01:50] (step=0556100) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:01:58] (step=0556200) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 05:02:07] (step=0556300) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:02:16] (step=0556400) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 05:02:24] (step=0556500) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:02:33] (step=0556600) Train Loss: 0.1490, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 05:02:43] (step=0556700) Train Loss: 0.1493, Train Steps/Sec: 9.97, Grad Norm: 0.0061 +[2025-02-20 05:02:52] (step=0556800) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 05:03:00] (step=0556900) Train Loss: 0.1494, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 05:03:09] (step=0557000) Train Loss: 0.1504, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 05:03:18] (step=0557100) Train Loss: 0.1505, Train Steps/Sec: 11.22, Grad Norm: 0.0060 +[2025-02-20 05:03:26] (step=0557200) Train Loss: 0.1502, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 05:03:36] (step=0557300) Train Loss: 0.1500, Train Steps/Sec: 10.91, Grad Norm: 0.0063 +[2025-02-20 05:03:45] (step=0557400) Train Loss: 0.1502, Train Steps/Sec: 10.85, Grad Norm: 0.0064 +[2025-02-20 05:03:54] (step=0557500) Train Loss: 0.1495, Train Steps/Sec: 10.87, Grad Norm: 0.0062 +[2025-02-20 05:04:03] (step=0557600) Train Loss: 0.1498, Train Steps/Sec: 11.55, Grad Norm: 0.0065 +[2025-02-20 05:04:11] (step=0557700) Train Loss: 0.1503, Train Steps/Sec: 11.54, Grad Norm: 0.0059 +[2025-02-20 05:04:20] (step=0557800) Train Loss: 0.1514, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 05:04:29] (step=0557900) Train Loss: 0.1503, Train Steps/Sec: 11.22, Grad Norm: 0.0064 +[2025-02-20 05:04:39] (step=0558000) Train Loss: 0.1505, Train Steps/Sec: 10.00, Grad Norm: 0.0061 +[2025-02-20 05:04:48] (step=0558100) Train Loss: 0.1491, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 05:04:56] (step=0558200) Train Loss: 0.1502, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 05:05:05] (step=0558300) Train Loss: 0.1501, Train Steps/Sec: 11.56, Grad Norm: 0.0066 +[2025-02-20 05:05:13] (step=0558400) Train Loss: 0.1502, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 05:05:22] (step=0558500) Train Loss: 0.1503, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 05:05:31] (step=0558600) Train Loss: 0.1505, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 05:05:39] (step=0558700) Train Loss: 0.1511, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 05:05:48] (step=0558800) Train Loss: 0.1503, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 05:05:57] (step=0558900) Train Loss: 0.1508, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 05:06:05] (step=0559000) Train Loss: 0.1503, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 05:06:14] (step=0559100) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 05:06:24] (step=0559200) Train Loss: 0.1504, Train Steps/Sec: 9.71, Grad Norm: 0.0060 +[2025-02-20 05:06:33] (step=0559300) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:06:41] (step=0559400) Train Loss: 0.1525, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:06:51] (step=0559500) Train Loss: 0.1500, Train Steps/Sec: 10.57, Grad Norm: 0.0062 +[2025-02-20 05:07:00] (step=0559600) Train Loss: 0.1494, Train Steps/Sec: 10.56, Grad Norm: 0.0061 +[2025-02-20 05:07:09] (step=0559700) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:07:18] (step=0559800) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 05:07:26] (step=0559900) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:07:35] (step=0560000) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 05:07:36] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0560000.pt +[2025-02-20 05:07:45] (step=0560100) Train Loss: 0.1506, Train Steps/Sec: 9.93, Grad Norm: 0.0060 +[2025-02-20 05:07:54] (step=0560200) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:08:02] (step=0560300) Train Loss: 0.1505, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 05:08:11] (step=0560400) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 05:08:21] (step=0560500) Train Loss: 0.1503, Train Steps/Sec: 10.03, Grad Norm: 0.0058 +[2025-02-20 05:08:29] (step=0560600) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:08:38] (step=0560700) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:08:47] (step=0560800) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 05:08:55] (step=0560900) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 05:09:04] (step=0561000) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 05:09:13] (step=0561100) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 05:09:21] (step=0561200) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:09:30] (step=0561300) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:09:39] (step=0561400) Train Loss: 0.1491, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-20 05:09:47] (step=0561500) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:09:56] (step=0561600) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 05:10:07] (step=0561700) Train Loss: 0.1501, Train Steps/Sec: 9.01, Grad Norm: 0.0061 +[2025-02-20 05:10:16] (step=0561800) Train Loss: 0.1499, Train Steps/Sec: 10.88, Grad Norm: 0.0067 +[2025-02-20 05:10:25] (step=0561900) Train Loss: 0.1492, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 05:10:33] (step=0562000) Train Loss: 0.1496, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 05:10:42] (step=0562100) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:10:51] (step=0562200) Train Loss: 0.1512, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 05:11:00] (step=0562300) Train Loss: 0.1494, Train Steps/Sec: 11.22, Grad Norm: 0.0063 +[2025-02-20 05:11:08] (step=0562400) Train Loss: 0.1484, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 05:11:17] (step=0562500) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:11:25] (step=0562600) Train Loss: 0.1488, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:11:34] (step=0562700) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:11:43] (step=0562800) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 05:11:51] (step=0562900) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:12:01] (step=0563000) Train Loss: 0.1506, Train Steps/Sec: 9.95, Grad Norm: 0.0060 +[2025-02-20 05:12:10] (step=0563100) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:12:19] (step=0563200) Train Loss: 0.1503, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 05:12:27] (step=0563300) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:12:36] (step=0563400) Train Loss: 0.1520, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 05:12:45] (step=0563500) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:12:53] (step=0563600) Train Loss: 0.1503, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-20 05:13:02] (step=0563700) Train Loss: 0.1493, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 05:13:11] (step=0563800) Train Loss: 0.1509, Train Steps/Sec: 11.26, Grad Norm: 0.0064 +[2025-02-20 05:13:20] (step=0563900) Train Loss: 0.1498, Train Steps/Sec: 10.55, Grad Norm: 0.0063 +[2025-02-20 05:13:30] (step=0564000) Train Loss: 0.1495, Train Steps/Sec: 10.89, Grad Norm: 0.0061 +[2025-02-20 05:13:38] (step=0564100) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 05:13:47] (step=0564200) Train Loss: 0.1493, Train Steps/Sec: 11.55, Grad Norm: 0.0062 +[2025-02-20 05:13:57] (step=0564300) Train Loss: 0.1502, Train Steps/Sec: 10.07, Grad Norm: 0.0060 +[2025-02-20 05:14:06] (step=0564400) Train Loss: 0.1497, Train Steps/Sec: 11.20, Grad Norm: 0.0064 +[2025-02-20 05:14:14] (step=0564500) Train Loss: 0.1525, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:14:23] (step=0564600) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:14:32] (step=0564700) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:14:40] (step=0564800) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:14:49] (step=0564900) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 05:14:57] (step=0565000) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 05:15:06] (step=0565100) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 05:15:15] (step=0565200) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 05:15:23] (step=0565300) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 05:15:32] (step=0565400) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 05:15:42] (step=0565500) Train Loss: 0.1502, Train Steps/Sec: 9.97, Grad Norm: 0.0061 +[2025-02-20 05:15:51] (step=0565600) Train Loss: 0.1486, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 05:15:59] (step=0565700) Train Loss: 0.1506, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-20 05:16:08] (step=0565800) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 05:16:17] (step=0565900) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 05:16:26] (step=0566000) Train Loss: 0.1495, Train Steps/Sec: 10.93, Grad Norm: 0.0059 +[2025-02-20 05:16:35] (step=0566100) Train Loss: 0.1498, Train Steps/Sec: 10.52, Grad Norm: 0.0062 +[2025-02-20 05:16:44] (step=0566200) Train Loss: 0.1501, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 05:16:53] (step=0566300) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 05:17:01] (step=0566400) Train Loss: 0.1495, Train Steps/Sec: 11.55, Grad Norm: 0.0060 +[2025-02-20 05:17:10] (step=0566500) Train Loss: 0.1511, Train Steps/Sec: 11.55, Grad Norm: 0.0063 +[2025-02-20 05:17:19] (step=0566600) Train Loss: 0.1492, Train Steps/Sec: 11.18, Grad Norm: 0.0063 +[2025-02-20 05:17:28] (step=0566700) Train Loss: 0.1496, Train Steps/Sec: 11.51, Grad Norm: 0.0060 +[2025-02-20 05:17:38] (step=0566800) Train Loss: 0.1517, Train Steps/Sec: 9.97, Grad Norm: 0.0065 +[2025-02-20 05:17:46] (step=0566900) Train Loss: 0.1503, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 05:17:55] (step=0567000) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:18:04] (step=0567100) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 05:18:12] (step=0567200) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 05:18:21] (step=0567300) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 05:18:30] (step=0567400) Train Loss: 0.1492, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:18:38] (step=0567500) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:18:47] (step=0567600) Train Loss: 0.1520, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 05:18:55] (step=0567700) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 05:19:04] (step=0567800) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 05:19:13] (step=0567900) Train Loss: 0.1501, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 05:19:23] (step=0568000) Train Loss: 0.1510, Train Steps/Sec: 10.02, Grad Norm: 0.0062 +[2025-02-20 05:19:31] (step=0568100) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:19:41] (step=0568200) Train Loss: 0.1498, Train Steps/Sec: 10.25, Grad Norm: 0.0060 +[2025-02-20 05:19:50] (step=0568300) Train Loss: 0.1503, Train Steps/Sec: 10.89, Grad Norm: 0.0060 +[2025-02-20 05:19:59] (step=0568400) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 05:20:08] (step=0568500) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 05:20:16] (step=0568600) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:20:25] (step=0568700) Train Loss: 0.1512, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:20:34] (step=0568800) Train Loss: 0.1485, Train Steps/Sec: 11.22, Grad Norm: 0.0058 +[2025-02-20 05:20:42] (step=0568900) Train Loss: 0.1515, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 05:20:51] (step=0569000) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:21:00] (step=0569100) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:21:08] (step=0569200) Train Loss: 0.1506, Train Steps/Sec: 11.57, Grad Norm: 0.0065 +[2025-02-20 05:21:18] (step=0569300) Train Loss: 0.1509, Train Steps/Sec: 10.03, Grad Norm: 0.0060 +[2025-02-20 05:21:27] (step=0569400) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:21:36] (step=0569500) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 05:21:44] (step=0569600) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 05:21:53] (step=0569700) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 05:22:01] (step=0569800) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 05:22:10] (step=0569900) Train Loss: 0.1483, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 05:22:19] (step=0570000) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:22:27] (step=0570100) Train Loss: 0.1504, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-20 05:22:36] (step=0570200) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 05:22:45] (step=0570300) Train Loss: 0.1516, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 05:22:54] (step=0570400) Train Loss: 0.1497, Train Steps/Sec: 10.25, Grad Norm: 0.0059 +[2025-02-20 05:23:05] (step=0570500) Train Loss: 0.1507, Train Steps/Sec: 9.37, Grad Norm: 0.0061 +[2025-02-20 05:23:14] (step=0570600) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:23:22] (step=0570700) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:23:31] (step=0570800) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 05:23:40] (step=0570900) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 05:23:49] (step=0571000) Train Loss: 0.1480, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-20 05:23:57] (step=0571100) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:24:06] (step=0571200) Train Loss: 0.1521, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:24:14] (step=0571300) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:24:23] (step=0571400) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:24:32] (step=0571500) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 05:24:40] (step=0571600) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:24:49] (step=0571700) Train Loss: 0.1505, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 05:24:59] (step=0571800) Train Loss: 0.1505, Train Steps/Sec: 10.02, Grad Norm: 0.0060 +[2025-02-20 05:25:07] (step=0571900) Train Loss: 0.1497, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:25:16] (step=0572000) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 05:25:25] (step=0572100) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0066 +[2025-02-20 05:25:34] (step=0572200) Train Loss: 0.1505, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 05:25:42] (step=0572300) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 05:25:51] (step=0572400) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 05:26:00] (step=0572500) Train Loss: 0.1495, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-20 05:26:09] (step=0572600) Train Loss: 0.1507, Train Steps/Sec: 10.87, Grad Norm: 0.0064 +[2025-02-20 05:26:18] (step=0572700) Train Loss: 0.1489, Train Steps/Sec: 10.90, Grad Norm: 0.0060 +[2025-02-20 05:26:27] (step=0572800) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:26:36] (step=0572900) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 05:26:46] (step=0573000) Train Loss: 0.1501, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-20 05:26:55] (step=0573100) Train Loss: 0.1490, Train Steps/Sec: 11.22, Grad Norm: 0.0063 +[2025-02-20 05:27:03] (step=0573200) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 05:27:12] (step=0573300) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:27:20] (step=0573400) Train Loss: 0.1495, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 05:27:29] (step=0573500) Train Loss: 0.1491, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 05:27:38] (step=0573600) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:27:46] (step=0573700) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:27:55] (step=0573800) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 05:28:04] (step=0573900) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:28:12] (step=0574000) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:28:21] (step=0574100) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:28:29] (step=0574200) Train Loss: 0.1502, Train Steps/Sec: 11.57, Grad Norm: 0.0064 +[2025-02-20 05:28:39] (step=0574300) Train Loss: 0.1504, Train Steps/Sec: 10.00, Grad Norm: 0.0062 +[2025-02-20 05:28:48] (step=0574400) Train Loss: 0.1501, Train Steps/Sec: 11.19, Grad Norm: 0.0062 +[2025-02-20 05:28:57] (step=0574500) Train Loss: 0.1516, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 05:29:06] (step=0574600) Train Loss: 0.1489, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 05:29:15] (step=0574700) Train Loss: 0.1502, Train Steps/Sec: 10.91, Grad Norm: 0.0062 +[2025-02-20 05:29:25] (step=0574800) Train Loss: 0.1506, Train Steps/Sec: 10.26, Grad Norm: 0.0062 +[2025-02-20 05:29:33] (step=0574900) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:29:42] (step=0575000) Train Loss: 0.1509, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 05:29:50] (step=0575100) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:29:59] (step=0575200) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 05:30:08] (step=0575300) Train Loss: 0.1495, Train Steps/Sec: 11.22, Grad Norm: 0.0061 +[2025-02-20 05:30:17] (step=0575400) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:30:27] (step=0575500) Train Loss: 0.1508, Train Steps/Sec: 9.98, Grad Norm: 0.0064 +[2025-02-20 05:30:35] (step=0575600) Train Loss: 0.1512, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:30:44] (step=0575700) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:30:53] (step=0575800) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:31:01] (step=0575900) Train Loss: 0.1515, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 05:31:10] (step=0576000) Train Loss: 0.1507, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:31:18] (step=0576100) Train Loss: 0.1505, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 05:31:27] (step=0576200) Train Loss: 0.1498, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:31:36] (step=0576300) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 05:31:44] (step=0576400) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 05:31:53] (step=0576500) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:32:02] (step=0576600) Train Loss: 0.1513, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 05:32:10] (step=0576700) Train Loss: 0.1489, Train Steps/Sec: 11.55, Grad Norm: 0.0063 +[2025-02-20 05:32:20] (step=0576800) Train Loss: 0.1510, Train Steps/Sec: 10.01, Grad Norm: 0.0062 +[2025-02-20 05:32:30] (step=0576900) Train Loss: 0.1496, Train Steps/Sec: 10.25, Grad Norm: 0.0063 +[2025-02-20 05:32:39] (step=0577000) Train Loss: 0.1500, Train Steps/Sec: 10.90, Grad Norm: 0.0064 +[2025-02-20 05:32:48] (step=0577100) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 05:32:57] (step=0577200) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:33:05] (step=0577300) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 05:33:14] (step=0577400) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:33:23] (step=0577500) Train Loss: 0.1505, Train Steps/Sec: 11.23, Grad Norm: 0.0063 +[2025-02-20 05:33:31] (step=0577600) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 05:33:40] (step=0577700) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:33:49] (step=0577800) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 05:33:57] (step=0577900) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 05:34:07] (step=0578000) Train Loss: 0.1500, Train Steps/Sec: 10.00, Grad Norm: 0.0063 +[2025-02-20 05:34:16] (step=0578100) Train Loss: 0.1484, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 05:34:25] (step=0578200) Train Loss: 0.1493, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 05:34:33] (step=0578300) Train Loss: 0.1505, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:34:42] (step=0578400) Train Loss: 0.1505, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:34:50] (step=0578500) Train Loss: 0.1509, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 05:34:59] (step=0578600) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:35:08] (step=0578700) Train Loss: 0.1499, Train Steps/Sec: 11.25, Grad Norm: 0.0064 +[2025-02-20 05:35:17] (step=0578800) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:35:25] (step=0578900) Train Loss: 0.1505, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 05:35:34] (step=0579000) Train Loss: 0.1489, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 05:35:44] (step=0579100) Train Loss: 0.1508, Train Steps/Sec: 10.54, Grad Norm: 0.0062 +[2025-02-20 05:35:53] (step=0579200) Train Loss: 0.1500, Train Steps/Sec: 10.84, Grad Norm: 0.0059 +[2025-02-20 05:36:03] (step=0579300) Train Loss: 0.1515, Train Steps/Sec: 9.95, Grad Norm: 0.0061 +[2025-02-20 05:36:12] (step=0579400) Train Loss: 0.1501, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 05:36:20] (step=0579500) Train Loss: 0.1497, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 05:36:29] (step=0579600) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:36:38] (step=0579700) Train Loss: 0.1503, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-20 05:36:46] (step=0579800) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:36:55] (step=0579900) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:37:04] (step=0580000) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 05:37:05] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0580000.pt +[2025-02-20 05:37:13] (step=0580100) Train Loss: 0.1505, Train Steps/Sec: 10.37, Grad Norm: 0.0060 +[2025-02-20 05:37:22] (step=0580200) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 05:37:30] (step=0580300) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:37:39] (step=0580400) Train Loss: 0.1511, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:37:49] (step=0580500) Train Loss: 0.1501, Train Steps/Sec: 9.99, Grad Norm: 0.0063 +[2025-02-20 05:37:58] (step=0580600) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 05:38:06] (step=0580700) Train Loss: 0.1497, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:38:15] (step=0580800) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 05:38:24] (step=0580900) Train Loss: 0.1503, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-20 05:38:32] (step=0581000) Train Loss: 0.1517, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:38:41] (step=0581100) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:38:50] (step=0581200) Train Loss: 0.1506, Train Steps/Sec: 10.92, Grad Norm: 0.0063 +[2025-02-20 05:38:59] (step=0581300) Train Loss: 0.1508, Train Steps/Sec: 10.87, Grad Norm: 0.0061 +[2025-02-20 05:39:09] (step=0581400) Train Loss: 0.1499, Train Steps/Sec: 10.91, Grad Norm: 0.0060 +[2025-02-20 05:39:17] (step=0581500) Train Loss: 0.1502, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 05:39:26] (step=0581600) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 05:39:35] (step=0581700) Train Loss: 0.1499, Train Steps/Sec: 11.55, Grad Norm: 0.0063 +[2025-02-20 05:39:45] (step=0581800) Train Loss: 0.1505, Train Steps/Sec: 9.72, Grad Norm: 0.0062 +[2025-02-20 05:39:53] (step=0581900) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:40:02] (step=0582000) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 05:40:11] (step=0582100) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 05:40:19] (step=0582200) Train Loss: 0.1516, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:40:28] (step=0582300) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 05:40:37] (step=0582400) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 05:40:45] (step=0582500) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:40:54] (step=0582600) Train Loss: 0.1489, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:41:02] (step=0582700) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 05:41:11] (step=0582800) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 05:41:20] (step=0582900) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 05:41:30] (step=0583000) Train Loss: 0.1495, Train Steps/Sec: 9.71, Grad Norm: 0.0060 +[2025-02-20 05:41:39] (step=0583100) Train Loss: 0.1492, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 05:41:47] (step=0583200) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 05:41:56] (step=0583300) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 05:42:05] (step=0583400) Train Loss: 0.1492, Train Steps/Sec: 10.93, Grad Norm: 0.0062 +[2025-02-20 05:42:15] (step=0583500) Train Loss: 0.1484, Train Steps/Sec: 10.26, Grad Norm: 0.0063 +[2025-02-20 05:42:23] (step=0583600) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:42:32] (step=0583700) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 05:42:40] (step=0583800) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 05:42:49] (step=0583900) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 05:42:58] (step=0584000) Train Loss: 0.1490, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-20 05:43:07] (step=0584100) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 05:43:15] (step=0584200) Train Loss: 0.1498, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 05:43:25] (step=0584300) Train Loss: 0.1495, Train Steps/Sec: 10.05, Grad Norm: 0.0062 +[2025-02-20 05:43:34] (step=0584400) Train Loss: 0.1489, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:43:42] (step=0584500) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:43:51] (step=0584600) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:44:00] (step=0584700) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 05:44:08] (step=0584800) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:44:17] (step=0584900) Train Loss: 0.1516, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:44:26] (step=0585000) Train Loss: 0.1511, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:44:34] (step=0585100) Train Loss: 0.1492, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:44:43] (step=0585200) Train Loss: 0.1492, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 05:44:52] (step=0585300) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 05:45:00] (step=0585400) Train Loss: 0.1486, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 05:45:10] (step=0585500) Train Loss: 0.1493, Train Steps/Sec: 9.99, Grad Norm: 0.0059 +[2025-02-20 05:45:20] (step=0585600) Train Loss: 0.1506, Train Steps/Sec: 10.27, Grad Norm: 0.0061 +[2025-02-20 05:45:29] (step=0585700) Train Loss: 0.1496, Train Steps/Sec: 10.91, Grad Norm: 0.0060 +[2025-02-20 05:45:38] (step=0585800) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:45:46] (step=0585900) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:45:55] (step=0586000) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 05:46:04] (step=0586100) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:46:13] (step=0586200) Train Loss: 0.1500, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-20 05:46:21] (step=0586300) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 05:46:30] (step=0586400) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 05:46:38] (step=0586500) Train Loss: 0.1514, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:46:47] (step=0586600) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:46:56] (step=0586700) Train Loss: 0.1500, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 05:47:06] (step=0586800) Train Loss: 0.1519, Train Steps/Sec: 10.08, Grad Norm: 0.0059 +[2025-02-20 05:47:14] (step=0586900) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 05:47:23] (step=0587000) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 05:47:32] (step=0587100) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 05:47:40] (step=0587200) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 05:47:49] (step=0587300) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 05:47:58] (step=0587400) Train Loss: 0.1489, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-20 05:48:06] (step=0587500) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 05:48:15] (step=0587600) Train Loss: 0.1520, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 05:48:24] (step=0587700) Train Loss: 0.1502, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 05:48:33] (step=0587800) Train Loss: 0.1512, Train Steps/Sec: 10.57, Grad Norm: 0.0062 +[2025-02-20 05:48:42] (step=0587900) Train Loss: 0.1487, Train Steps/Sec: 10.90, Grad Norm: 0.0060 +[2025-02-20 05:48:52] (step=0588000) Train Loss: 0.1505, Train Steps/Sec: 10.02, Grad Norm: 0.0060 +[2025-02-20 05:49:01] (step=0588100) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 05:49:10] (step=0588200) Train Loss: 0.1513, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:49:18] (step=0588300) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 05:49:27] (step=0588400) Train Loss: 0.1493, Train Steps/Sec: 11.22, Grad Norm: 0.0062 +[2025-02-20 05:49:36] (step=0588500) Train Loss: 0.1494, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 05:49:44] (step=0588600) Train Loss: 0.1512, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:49:53] (step=0588700) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 05:50:02] (step=0588800) Train Loss: 0.1494, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:50:10] (step=0588900) Train Loss: 0.1519, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:50:19] (step=0589000) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:50:27] (step=0589100) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:50:36] (step=0589200) Train Loss: 0.1498, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 05:50:46] (step=0589300) Train Loss: 0.1493, Train Steps/Sec: 10.09, Grad Norm: 0.0062 +[2025-02-20 05:50:55] (step=0589400) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 05:51:03] (step=0589500) Train Loss: 0.1490, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 05:51:12] (step=0589600) Train Loss: 0.1511, Train Steps/Sec: 11.25, Grad Norm: 0.0062 +[2025-02-20 05:51:21] (step=0589700) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 05:51:29] (step=0589800) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:51:39] (step=0589900) Train Loss: 0.1506, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-20 05:51:48] (step=0590000) Train Loss: 0.1507, Train Steps/Sec: 10.87, Grad Norm: 0.0064 +[2025-02-20 05:51:57] (step=0590100) Train Loss: 0.1510, Train Steps/Sec: 10.92, Grad Norm: 0.0062 +[2025-02-20 05:52:05] (step=0590200) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 05:52:14] (step=0590300) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 05:52:23] (step=0590400) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:52:33] (step=0590500) Train Loss: 0.1500, Train Steps/Sec: 9.70, Grad Norm: 0.0060 +[2025-02-20 05:52:42] (step=0590600) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 05:52:50] (step=0590700) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:52:59] (step=0590800) Train Loss: 0.1518, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 05:53:08] (step=0590900) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 05:53:16] (step=0591000) Train Loss: 0.1488, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 05:53:25] (step=0591100) Train Loss: 0.1504, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 05:53:33] (step=0591200) Train Loss: 0.1504, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 05:53:42] (step=0591300) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 05:53:51] (step=0591400) Train Loss: 0.1499, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 05:53:59] (step=0591500) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:54:08] (step=0591600) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:54:17] (step=0591700) Train Loss: 0.1496, Train Steps/Sec: 11.21, Grad Norm: 0.0060 +[2025-02-20 05:54:27] (step=0591800) Train Loss: 0.1494, Train Steps/Sec: 10.05, Grad Norm: 0.0059 +[2025-02-20 05:54:35] (step=0591900) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 05:54:44] (step=0592000) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 05:54:53] (step=0592100) Train Loss: 0.1494, Train Steps/Sec: 10.96, Grad Norm: 0.0063 +[2025-02-20 05:55:03] (step=0592200) Train Loss: 0.1507, Train Steps/Sec: 10.27, Grad Norm: 0.0062 +[2025-02-20 05:55:12] (step=0592300) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 05:55:20] (step=0592400) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:55:29] (step=0592500) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:55:37] (step=0592600) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 05:55:46] (step=0592700) Train Loss: 0.1490, Train Steps/Sec: 11.21, Grad Norm: 0.0062 +[2025-02-20 05:55:55] (step=0592800) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:56:04] (step=0592900) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 05:56:14] (step=0593000) Train Loss: 0.1495, Train Steps/Sec: 9.99, Grad Norm: 0.0065 +[2025-02-20 05:56:22] (step=0593100) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 05:56:31] (step=0593200) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 05:56:39] (step=0593300) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 05:56:48] (step=0593400) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:56:57] (step=0593500) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:57:05] (step=0593600) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 05:57:14] (step=0593700) Train Loss: 0.1514, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 05:57:23] (step=0593800) Train Loss: 0.1511, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 05:57:31] (step=0593900) Train Loss: 0.1503, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 05:57:40] (step=0594000) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 05:57:49] (step=0594100) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 05:57:57] (step=0594200) Train Loss: 0.1506, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 05:58:08] (step=0594300) Train Loss: 0.1506, Train Steps/Sec: 9.02, Grad Norm: 0.0062 +[2025-02-20 05:58:18] (step=0594400) Train Loss: 0.1500, Train Steps/Sec: 10.89, Grad Norm: 0.0061 +[2025-02-20 05:58:26] (step=0594500) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 05:58:35] (step=0594600) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:58:44] (step=0594700) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 05:58:52] (step=0594800) Train Loss: 0.1488, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 05:59:01] (step=0594900) Train Loss: 0.1498, Train Steps/Sec: 11.24, Grad Norm: 0.0065 +[2025-02-20 05:59:10] (step=0595000) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 05:59:18] (step=0595100) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0058 +[2025-02-20 05:59:27] (step=0595200) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:59:36] (step=0595300) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 05:59:44] (step=0595400) Train Loss: 0.1492, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 05:59:54] (step=0595500) Train Loss: 0.1495, Train Steps/Sec: 9.94, Grad Norm: 0.0063 +[2025-02-20 06:00:03] (step=0595600) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 06:00:11] (step=0595700) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 06:00:20] (step=0595800) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 06:00:29] (step=0595900) Train Loss: 0.1514, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 06:00:38] (step=0596000) Train Loss: 0.1508, Train Steps/Sec: 11.27, Grad Norm: 0.0064 +[2025-02-20 06:00:46] (step=0596100) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 06:00:55] (step=0596200) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 06:01:03] (step=0596300) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 06:01:12] (step=0596400) Train Loss: 0.1491, Train Steps/Sec: 11.26, Grad Norm: 0.0059 +[2025-02-20 06:01:22] (step=0596500) Train Loss: 0.1489, Train Steps/Sec: 10.55, Grad Norm: 0.0060 +[2025-02-20 06:01:31] (step=0596600) Train Loss: 0.1489, Train Steps/Sec: 10.91, Grad Norm: 0.0060 +[2025-02-20 06:01:40] (step=0596700) Train Loss: 0.1513, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 06:01:50] (step=0596800) Train Loss: 0.1505, Train Steps/Sec: 9.98, Grad Norm: 0.0060 +[2025-02-20 06:01:58] (step=0596900) Train Loss: 0.1506, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 06:02:07] (step=0597000) Train Loss: 0.1504, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 06:02:16] (step=0597100) Train Loss: 0.1506, Train Steps/Sec: 11.22, Grad Norm: 0.0060 +[2025-02-20 06:02:24] (step=0597200) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 06:02:33] (step=0597300) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 06:02:42] (step=0597400) Train Loss: 0.1497, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 06:02:50] (step=0597500) Train Loss: 0.1483, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:02:59] (step=0597600) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 06:03:08] (step=0597700) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 06:03:16] (step=0597800) Train Loss: 0.1487, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 06:03:25] (step=0597900) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 06:03:35] (step=0598000) Train Loss: 0.1503, Train Steps/Sec: 10.05, Grad Norm: 0.0061 +[2025-02-20 06:03:43] (step=0598100) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 06:03:52] (step=0598200) Train Loss: 0.1484, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 06:04:01] (step=0598300) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 06:04:10] (step=0598400) Train Loss: 0.1519, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 06:04:18] (step=0598500) Train Loss: 0.1492, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:04:27] (step=0598600) Train Loss: 0.1504, Train Steps/Sec: 10.93, Grad Norm: 0.0059 +[2025-02-20 06:04:37] (step=0598700) Train Loss: 0.1494, Train Steps/Sec: 10.86, Grad Norm: 0.0066 +[2025-02-20 06:04:46] (step=0598800) Train Loss: 0.1492, Train Steps/Sec: 10.89, Grad Norm: 0.0060 +[2025-02-20 06:04:54] (step=0598900) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 06:05:03] (step=0599000) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 06:05:12] (step=0599100) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:05:20] (step=0599200) Train Loss: 0.1514, Train Steps/Sec: 11.54, Grad Norm: 0.0062 +[2025-02-20 06:05:31] (step=0599300) Train Loss: 0.1493, Train Steps/Sec: 9.76, Grad Norm: 0.0059 +[2025-02-20 06:05:39] (step=0599400) Train Loss: 0.1521, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:05:48] (step=0599500) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 06:05:56] (step=0599600) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:06:05] (step=0599700) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 06:06:14] (step=0599800) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 06:06:22] (step=0599900) Train Loss: 0.1479, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 06:06:31] (step=0600000) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 06:06:32] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0600000.pt +[2025-02-20 06:06:40] (step=0600100) Train Loss: 0.1489, Train Steps/Sec: 10.61, Grad Norm: 0.0060 +[2025-02-20 06:06:49] (step=0600200) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 06:06:58] (step=0600300) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 06:07:06] (step=0600400) Train Loss: 0.1491, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 06:07:16] (step=0600500) Train Loss: 0.1496, Train Steps/Sec: 10.01, Grad Norm: 0.0061 +[2025-02-20 06:07:25] (step=0600600) Train Loss: 0.1507, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 06:07:34] (step=0600700) Train Loss: 0.1502, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 06:07:43] (step=0600800) Train Loss: 0.1500, Train Steps/Sec: 10.55, Grad Norm: 0.0060 +[2025-02-20 06:07:53] (step=0600900) Train Loss: 0.1503, Train Steps/Sec: 10.54, Grad Norm: 0.0061 +[2025-02-20 06:08:01] (step=0601000) Train Loss: 0.1505, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 06:08:10] (step=0601100) Train Loss: 0.1507, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 06:08:19] (step=0601200) Train Loss: 0.1500, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 06:08:27] (step=0601300) Train Loss: 0.1508, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 06:08:36] (step=0601400) Train Loss: 0.1491, Train Steps/Sec: 11.18, Grad Norm: 0.0060 +[2025-02-20 06:08:45] (step=0601500) Train Loss: 0.1497, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 06:08:53] (step=0601600) Train Loss: 0.1489, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 06:09:02] (step=0601700) Train Loss: 0.1504, Train Steps/Sec: 11.51, Grad Norm: 0.0061 +[2025-02-20 06:09:12] (step=0601800) Train Loss: 0.1499, Train Steps/Sec: 10.00, Grad Norm: 0.0062 +[2025-02-20 06:09:21] (step=0601900) Train Loss: 0.1513, Train Steps/Sec: 11.55, Grad Norm: 0.0060 +[2025-02-20 06:09:30] (step=0602000) Train Loss: 0.1504, Train Steps/Sec: 11.55, Grad Norm: 0.0062 +[2025-02-20 06:09:38] (step=0602100) Train Loss: 0.1520, Train Steps/Sec: 11.55, Grad Norm: 0.0064 +[2025-02-20 06:09:47] (step=0602200) Train Loss: 0.1494, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 06:09:55] (step=0602300) Train Loss: 0.1505, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 06:10:04] (step=0602400) Train Loss: 0.1515, Train Steps/Sec: 11.56, Grad Norm: 0.0064 +[2025-02-20 06:10:13] (step=0602500) Train Loss: 0.1513, Train Steps/Sec: 11.58, Grad Norm: 0.0065 +[2025-02-20 06:10:22] (step=0602600) Train Loss: 0.1506, Train Steps/Sec: 11.26, Grad Norm: 0.0063 +[2025-02-20 06:10:30] (step=0602700) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 06:10:39] (step=0602800) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:10:48] (step=0602900) Train Loss: 0.1506, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-20 06:10:58] (step=0603000) Train Loss: 0.1500, Train Steps/Sec: 9.33, Grad Norm: 0.0061 +[2025-02-20 06:11:08] (step=0603100) Train Loss: 0.1500, Train Steps/Sec: 10.89, Grad Norm: 0.0061 +[2025-02-20 06:11:16] (step=0603200) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 06:11:25] (step=0603300) Train Loss: 0.1504, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 06:11:34] (step=0603400) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-20 06:11:42] (step=0603500) Train Loss: 0.1489, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 06:11:51] (step=0603600) Train Loss: 0.1488, Train Steps/Sec: 11.23, Grad Norm: 0.0058 +[2025-02-20 06:12:00] (step=0603700) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 06:12:08] (step=0603800) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:12:17] (step=0603900) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 06:12:26] (step=0604000) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 06:12:34] (step=0604100) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 06:12:43] (step=0604200) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 06:12:53] (step=0604300) Train Loss: 0.1496, Train Steps/Sec: 9.94, Grad Norm: 0.0062 +[2025-02-20 06:13:01] (step=0604400) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 06:13:10] (step=0604500) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 06:13:19] (step=0604600) Train Loss: 0.1498, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 06:13:28] (step=0604700) Train Loss: 0.1497, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-20 06:13:36] (step=0604800) Train Loss: 0.1491, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:13:45] (step=0604900) Train Loss: 0.1515, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:13:54] (step=0605000) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 06:14:02] (step=0605100) Train Loss: 0.1497, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 06:14:12] (step=0605200) Train Loss: 0.1502, Train Steps/Sec: 10.53, Grad Norm: 0.0062 +[2025-02-20 06:14:21] (step=0605300) Train Loss: 0.1485, Train Steps/Sec: 10.87, Grad Norm: 0.0063 +[2025-02-20 06:14:30] (step=0605400) Train Loss: 0.1507, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 06:14:40] (step=0605500) Train Loss: 0.1510, Train Steps/Sec: 10.01, Grad Norm: 0.0060 +[2025-02-20 06:14:48] (step=0605600) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 06:14:57] (step=0605700) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 06:15:06] (step=0605800) Train Loss: 0.1510, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-20 06:15:14] (step=0605900) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 06:15:23] (step=0606000) Train Loss: 0.1516, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 06:15:32] (step=0606100) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 06:15:40] (step=0606200) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 06:15:49] (step=0606300) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 06:15:58] (step=0606400) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 06:16:06] (step=0606500) Train Loss: 0.1484, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 06:16:15] (step=0606600) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 06:16:23] (step=0606700) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 06:16:33] (step=0606800) Train Loss: 0.1500, Train Steps/Sec: 10.03, Grad Norm: 0.0061 +[2025-02-20 06:16:42] (step=0606900) Train Loss: 0.1503, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 06:16:51] (step=0607000) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 06:16:59] (step=0607100) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 06:17:08] (step=0607200) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 06:17:17] (step=0607300) Train Loss: 0.1512, Train Steps/Sec: 10.95, Grad Norm: 0.0062 +[2025-02-20 06:17:27] (step=0607400) Train Loss: 0.1503, Train Steps/Sec: 10.57, Grad Norm: 0.0062 +[2025-02-20 06:17:36] (step=0607500) Train Loss: 0.1503, Train Steps/Sec: 11.25, Grad Norm: 0.0062 +[2025-02-20 06:17:44] (step=0607600) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 06:17:53] (step=0607700) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 06:18:01] (step=0607800) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 06:18:10] (step=0607900) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 06:18:20] (step=0608000) Train Loss: 0.1505, Train Steps/Sec: 9.72, Grad Norm: 0.0060 +[2025-02-20 06:18:29] (step=0608100) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 06:18:38] (step=0608200) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 06:18:46] (step=0608300) Train Loss: 0.1503, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 06:18:55] (step=0608400) Train Loss: 0.1509, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 06:19:03] (step=0608500) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 06:19:12] (step=0608600) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 06:19:21] (step=0608700) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 06:19:29] (step=0608800) Train Loss: 0.1511, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 06:19:38] (step=0608900) Train Loss: 0.1511, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 06:19:47] (step=0609000) Train Loss: 0.1495, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 06:19:56] (step=0609100) Train Loss: 0.1498, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 06:20:04] (step=0609200) Train Loss: 0.1500, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 06:20:14] (step=0609300) Train Loss: 0.1487, Train Steps/Sec: 9.97, Grad Norm: 0.0061 +[2025-02-20 06:20:23] (step=0609400) Train Loss: 0.1485, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 06:20:33] (step=0609500) Train Loss: 0.1496, Train Steps/Sec: 10.24, Grad Norm: 0.0059 +[2025-02-20 06:20:42] (step=0609600) Train Loss: 0.1491, Train Steps/Sec: 10.88, Grad Norm: 0.0062 +[2025-02-20 06:20:50] (step=0609700) Train Loss: 0.1508, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 06:20:59] (step=0609800) Train Loss: 0.1505, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 06:21:08] (step=0609900) Train Loss: 0.1500, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 06:21:16] (step=0610000) Train Loss: 0.1490, Train Steps/Sec: 11.58, Grad Norm: 0.0065 +[2025-02-20 06:21:25] (step=0610100) Train Loss: 0.1509, Train Steps/Sec: 11.21, Grad Norm: 0.0059 +[2025-02-20 06:21:34] (step=0610200) Train Loss: 0.1507, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 06:21:43] (step=0610300) Train Loss: 0.1496, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 06:21:51] (step=0610400) Train Loss: 0.1489, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 06:22:01] (step=0610500) Train Loss: 0.1496, Train Steps/Sec: 9.88, Grad Norm: 0.0059 +[2025-02-20 06:22:10] (step=0610600) Train Loss: 0.1520, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 06:22:19] (step=0610700) Train Loss: 0.1508, Train Steps/Sec: 11.57, Grad Norm: 0.0064 +[2025-02-20 06:22:27] (step=0610800) Train Loss: 0.1503, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 06:22:36] (step=0610900) Train Loss: 0.1504, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 06:22:45] (step=0611000) Train Loss: 0.1495, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 06:22:53] (step=0611100) Train Loss: 0.1504, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 06:23:02] (step=0611200) Train Loss: 0.1494, Train Steps/Sec: 11.23, Grad Norm: 0.0063 +[2025-02-20 06:23:11] (step=0611300) Train Loss: 0.1496, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 06:23:19] (step=0611400) Train Loss: 0.1498, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 06:23:28] (step=0611500) Train Loss: 0.1496, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 06:23:37] (step=0611600) Train Loss: 0.1497, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-20 06:23:46] (step=0611700) Train Loss: 0.1500, Train Steps/Sec: 10.53, Grad Norm: 0.0060 +[2025-02-20 06:23:57] (step=0611800) Train Loss: 0.1513, Train Steps/Sec: 9.45, Grad Norm: 0.0062 +[2025-02-20 06:24:06] (step=0611900) Train Loss: 0.1520, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 06:24:14] (step=0612000) Train Loss: 0.1494, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 06:24:23] (step=0612100) Train Loss: 0.1491, Train Steps/Sec: 11.55, Grad Norm: 0.0059 +[2025-02-20 06:24:32] (step=0612200) Train Loss: 0.1511, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 06:24:41] (step=0612300) Train Loss: 0.1498, Train Steps/Sec: 11.19, Grad Norm: 0.0064 +[2025-02-20 06:24:49] (step=0612400) Train Loss: 0.1494, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 06:24:58] (step=0612500) Train Loss: 0.1504, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 06:25:07] (step=0612600) Train Loss: 0.1504, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 06:25:15] (step=0612700) Train Loss: 0.1502, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 06:25:24] (step=0612800) Train Loss: 0.1503, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 06:25:33] (step=0612900) Train Loss: 0.1492, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 06:25:43] (step=0613000) Train Loss: 0.1505, Train Steps/Sec: 9.90, Grad Norm: 0.0061 +[2025-02-20 06:25:51] (step=0613100) Train Loss: 0.1511, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 06:26:00] (step=0613200) Train Loss: 0.1509, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 06:26:09] (step=0613300) Train Loss: 0.1500, Train Steps/Sec: 11.54, Grad Norm: 0.0061 +[2025-02-20 06:26:17] (step=0613400) Train Loss: 0.1513, Train Steps/Sec: 11.21, Grad Norm: 0.0061 +[2025-02-20 06:26:26] (step=0613500) Train Loss: 0.1507, Train Steps/Sec: 11.55, Grad Norm: 0.0062 +[2025-02-20 06:26:35] (step=0613600) Train Loss: 0.1499, Train Steps/Sec: 11.55, Grad Norm: 0.0060 +[2025-02-20 06:26:43] (step=0613700) Train Loss: 0.1499, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 06:26:52] (step=0613800) Train Loss: 0.1513, Train Steps/Sec: 11.22, Grad Norm: 0.0062 +[2025-02-20 06:27:02] (step=0613900) Train Loss: 0.1498, Train Steps/Sec: 10.53, Grad Norm: 0.0061 +[2025-02-20 06:27:11] (step=0614000) Train Loss: 0.1506, Train Steps/Sec: 10.85, Grad Norm: 0.0060 +[2025-02-20 06:27:20] (step=0614100) Train Loss: 0.1498, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 06:27:28] (step=0614200) Train Loss: 0.1501, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 06:27:38] (step=0614300) Train Loss: 0.1495, Train Steps/Sec: 9.96, Grad Norm: 0.0062 +[2025-02-20 06:27:47] (step=0614400) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 06:27:56] (step=0614500) Train Loss: 0.1496, Train Steps/Sec: 11.21, Grad Norm: 0.0060 +[2025-02-20 06:28:05] (step=0614600) Train Loss: 0.1501, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 06:28:13] (step=0614700) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 06:28:22] (step=0614800) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 06:28:31] (step=0614900) Train Loss: 0.1493, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 06:28:39] (step=0615000) Train Loss: 0.1507, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 06:28:48] (step=0615100) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 06:28:56] (step=0615200) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 06:29:05] (step=0615300) Train Loss: 0.1496, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 06:29:14] (step=0615400) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 06:29:24] (step=0615500) Train Loss: 0.1484, Train Steps/Sec: 9.66, Grad Norm: 0.0064 +[2025-02-20 06:29:33] (step=0615600) Train Loss: 0.1497, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 06:29:41] (step=0615700) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 06:29:50] (step=0615800) Train Loss: 0.1498, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 06:29:59] (step=0615900) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 06:30:08] (step=0616000) Train Loss: 0.1507, Train Steps/Sec: 10.91, Grad Norm: 0.0063 +[2025-02-20 06:30:17] (step=0616100) Train Loss: 0.1503, Train Steps/Sec: 10.87, Grad Norm: 0.0064 +[2025-02-20 06:30:26] (step=0616200) Train Loss: 0.1509, Train Steps/Sec: 10.92, Grad Norm: 0.0064 +[2025-02-20 06:30:35] (step=0616300) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 06:30:43] (step=0616400) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 06:30:52] (step=0616500) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 06:31:01] (step=0616600) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 06:31:09] (step=0616700) Train Loss: 0.1491, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-20 06:31:19] (step=0616800) Train Loss: 0.1501, Train Steps/Sec: 9.97, Grad Norm: 0.0060 +[2025-02-20 06:31:28] (step=0616900) Train Loss: 0.1495, Train Steps/Sec: 11.56, Grad Norm: 0.0064 +[2025-02-20 06:31:37] (step=0617000) Train Loss: 0.1502, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 06:31:45] (step=0617100) Train Loss: 0.1498, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 06:31:54] (step=0617200) Train Loss: 0.1504, Train Steps/Sec: 11.55, Grad Norm: 0.0062 +[2025-02-20 06:32:03] (step=0617300) Train Loss: 0.1492, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 06:32:11] (step=0617400) Train Loss: 0.1500, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 06:32:20] (step=0617500) Train Loss: 0.1509, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 06:32:29] (step=0617600) Train Loss: 0.1493, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 06:32:38] (step=0617700) Train Loss: 0.1495, Train Steps/Sec: 11.22, Grad Norm: 0.0062 +[2025-02-20 06:32:46] (step=0617800) Train Loss: 0.1497, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 06:32:55] (step=0617900) Train Loss: 0.1502, Train Steps/Sec: 11.55, Grad Norm: 0.0063 +[2025-02-20 06:33:05] (step=0618000) Train Loss: 0.1489, Train Steps/Sec: 10.02, Grad Norm: 0.0059 +[2025-02-20 06:33:14] (step=0618100) Train Loss: 0.1508, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 06:33:23] (step=0618200) Train Loss: 0.1498, Train Steps/Sec: 10.56, Grad Norm: 0.0060 +[2025-02-20 06:33:32] (step=0618300) Train Loss: 0.1513, Train Steps/Sec: 10.87, Grad Norm: 0.0060 +[2025-02-20 06:33:41] (step=0618400) Train Loss: 0.1503, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 06:33:50] (step=0618500) Train Loss: 0.1512, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 06:33:58] (step=0618600) Train Loss: 0.1508, Train Steps/Sec: 11.55, Grad Norm: 0.0063 +[2025-02-20 06:34:07] (step=0618700) Train Loss: 0.1507, Train Steps/Sec: 11.53, Grad Norm: 0.0064 +[2025-02-20 06:34:16] (step=0618800) Train Loss: 0.1477, Train Steps/Sec: 11.15, Grad Norm: 0.0059 +[2025-02-20 06:34:24] (step=0618900) Train Loss: 0.1493, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 06:34:33] (step=0619000) Train Loss: 0.1503, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 06:34:42] (step=0619100) Train Loss: 0.1508, Train Steps/Sec: 11.57, Grad Norm: 0.0067 +[2025-02-20 06:34:50] (step=0619200) Train Loss: 0.1498, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 06:35:01] (step=0619300) Train Loss: 0.1508, Train Steps/Sec: 9.92, Grad Norm: 0.0063 +[2025-02-20 06:35:09] (step=0619400) Train Loss: 0.1511, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 06:35:18] (step=0619500) Train Loss: 0.1508, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 06:35:26] (step=0619600) Train Loss: 0.1522, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 06:35:35] (step=0619700) Train Loss: 0.1494, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 06:35:44] (step=0619800) Train Loss: 0.1488, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 06:35:53] (step=0619900) Train Loss: 0.1497, Train Steps/Sec: 11.24, Grad Norm: 0.0059 +[2025-02-20 06:36:01] (step=0620000) Train Loss: 0.1488, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 06:36:02] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0620000.pt +[2025-02-20 06:36:11] (step=0620100) Train Loss: 0.1512, Train Steps/Sec: 10.55, Grad Norm: 0.0060 +[2025-02-20 06:36:19] (step=0620200) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 06:36:28] (step=0620300) Train Loss: 0.1501, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 06:36:38] (step=0620400) Train Loss: 0.1515, Train Steps/Sec: 10.53, Grad Norm: 0.0059 +[2025-02-20 06:36:48] (step=0620500) Train Loss: 0.1497, Train Steps/Sec: 9.43, Grad Norm: 0.0060 +[2025-02-20 06:36:57] (step=0620600) Train Loss: 0.1489, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 06:37:06] (step=0620700) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 06:37:14] (step=0620800) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 06:37:23] (step=0620900) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 06:37:32] (step=0621000) Train Loss: 0.1505, Train Steps/Sec: 11.22, Grad Norm: 0.0059 +[2025-02-20 06:37:40] (step=0621100) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 06:37:49] (step=0621200) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 06:37:58] (step=0621300) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 06:38:06] (step=0621400) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 06:38:15] (step=0621500) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 06:38:23] (step=0621600) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 06:38:32] (step=0621700) Train Loss: 0.1517, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 06:38:42] (step=0621800) Train Loss: 0.1498, Train Steps/Sec: 9.98, Grad Norm: 0.0064 +[2025-02-20 06:38:51] (step=0621900) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 06:39:00] (step=0622000) Train Loss: 0.1513, Train Steps/Sec: 11.24, Grad Norm: 0.0066 +[2025-02-20 06:39:08] (step=0622100) Train Loss: 0.1507, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 06:39:17] (step=0622200) Train Loss: 0.1507, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 06:39:25] (step=0622300) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 06:39:34] (step=0622400) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 06:39:43] (step=0622500) Train Loss: 0.1514, Train Steps/Sec: 10.92, Grad Norm: 0.0060 +[2025-02-20 06:39:52] (step=0622600) Train Loss: 0.1496, Train Steps/Sec: 10.85, Grad Norm: 0.0063 +[2025-02-20 06:40:02] (step=0622700) Train Loss: 0.1502, Train Steps/Sec: 10.91, Grad Norm: 0.0060 +[2025-02-20 06:40:10] (step=0622800) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:40:19] (step=0622900) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 06:40:29] (step=0623000) Train Loss: 0.1493, Train Steps/Sec: 9.93, Grad Norm: 0.0061 +[2025-02-20 06:40:38] (step=0623100) Train Loss: 0.1487, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 06:40:47] (step=0623200) Train Loss: 0.1516, Train Steps/Sec: 11.19, Grad Norm: 0.0059 +[2025-02-20 06:40:55] (step=0623300) Train Loss: 0.1494, Train Steps/Sec: 11.54, Grad Norm: 0.0059 +[2025-02-20 06:41:04] (step=0623400) Train Loss: 0.1484, Train Steps/Sec: 11.55, Grad Norm: 0.0059 +[2025-02-20 06:41:13] (step=0623500) Train Loss: 0.1501, Train Steps/Sec: 11.55, Grad Norm: 0.0059 +[2025-02-20 06:41:21] (step=0623600) Train Loss: 0.1500, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 06:41:30] (step=0623700) Train Loss: 0.1500, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 06:41:38] (step=0623800) Train Loss: 0.1497, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 06:41:47] (step=0623900) Train Loss: 0.1498, Train Steps/Sec: 11.55, Grad Norm: 0.0059 +[2025-02-20 06:41:56] (step=0624000) Train Loss: 0.1503, Train Steps/Sec: 11.55, Grad Norm: 0.0060 +[2025-02-20 06:42:04] (step=0624100) Train Loss: 0.1498, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 06:42:13] (step=0624200) Train Loss: 0.1495, Train Steps/Sec: 11.22, Grad Norm: 0.0060 +[2025-02-20 06:42:23] (step=0624300) Train Loss: 0.1492, Train Steps/Sec: 9.99, Grad Norm: 0.0063 +[2025-02-20 06:42:32] (step=0624400) Train Loss: 0.1511, Train Steps/Sec: 11.55, Grad Norm: 0.0063 +[2025-02-20 06:42:41] (step=0624500) Train Loss: 0.1513, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 06:42:49] (step=0624600) Train Loss: 0.1503, Train Steps/Sec: 11.54, Grad Norm: 0.0060 +[2025-02-20 06:42:59] (step=0624700) Train Loss: 0.1502, Train Steps/Sec: 10.56, Grad Norm: 0.0061 +[2025-02-20 06:43:08] (step=0624800) Train Loss: 0.1508, Train Steps/Sec: 10.51, Grad Norm: 0.0060 +[2025-02-20 06:43:17] (step=0624900) Train Loss: 0.1499, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 06:43:26] (step=0625000) Train Loss: 0.1508, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 06:43:34] (step=0625100) Train Loss: 0.1511, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 06:43:43] (step=0625200) Train Loss: 0.1509, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 06:43:52] (step=0625300) Train Loss: 0.1496, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 06:44:01] (step=0625400) Train Loss: 0.1513, Train Steps/Sec: 11.18, Grad Norm: 0.0059 +[2025-02-20 06:44:10] (step=0625500) Train Loss: 0.1499, Train Steps/Sec: 10.80, Grad Norm: 0.0059 +[2025-02-20 06:44:19] (step=0625600) Train Loss: 0.1480, Train Steps/Sec: 10.58, Grad Norm: 0.0061 +[2025-02-20 06:44:28] (step=0625700) Train Loss: 0.1496, Train Steps/Sec: 11.55, Grad Norm: 0.0057 +[2025-02-20 06:44:37] (step=0625800) Train Loss: 0.1489, Train Steps/Sec: 11.54, Grad Norm: 0.0067 +[2025-02-20 06:44:45] (step=0625900) Train Loss: 0.1496, Train Steps/Sec: 11.55, Grad Norm: 0.0063 +[2025-02-20 06:44:54] (step=0626000) Train Loss: 0.1500, Train Steps/Sec: 11.55, Grad Norm: 0.0062 +[2025-02-20 06:45:03] (step=0626100) Train Loss: 0.1509, Train Steps/Sec: 11.55, Grad Norm: 0.0060 +[2025-02-20 06:45:11] (step=0626200) Train Loss: 0.1504, Train Steps/Sec: 11.54, Grad Norm: 0.0061 +[2025-02-20 06:45:20] (step=0626300) Train Loss: 0.1506, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 06:45:29] (step=0626400) Train Loss: 0.1500, Train Steps/Sec: 11.22, Grad Norm: 0.0061 +[2025-02-20 06:45:37] (step=0626500) Train Loss: 0.1498, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 06:45:46] (step=0626600) Train Loss: 0.1514, Train Steps/Sec: 11.55, Grad Norm: 0.0058 +[2025-02-20 06:45:55] (step=0626700) Train Loss: 0.1500, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 06:46:05] (step=0626800) Train Loss: 0.1506, Train Steps/Sec: 9.73, Grad Norm: 0.0060 +[2025-02-20 06:46:15] (step=0626900) Train Loss: 0.1511, Train Steps/Sec: 10.54, Grad Norm: 0.0061 +[2025-02-20 06:46:24] (step=0627000) Train Loss: 0.1502, Train Steps/Sec: 10.87, Grad Norm: 0.0060 +[2025-02-20 06:46:32] (step=0627100) Train Loss: 0.1499, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 06:46:41] (step=0627200) Train Loss: 0.1507, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 06:46:50] (step=0627300) Train Loss: 0.1512, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:46:58] (step=0627400) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:47:07] (step=0627500) Train Loss: 0.1494, Train Steps/Sec: 11.22, Grad Norm: 0.0059 +[2025-02-20 06:47:16] (step=0627600) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 06:47:24] (step=0627700) Train Loss: 0.1490, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 06:47:33] (step=0627800) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 06:47:42] (step=0627900) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 06:47:50] (step=0628000) Train Loss: 0.1509, Train Steps/Sec: 11.56, Grad Norm: 0.0065 +[2025-02-20 06:48:00] (step=0628100) Train Loss: 0.1506, Train Steps/Sec: 10.01, Grad Norm: 0.0061 +[2025-02-20 06:48:09] (step=0628200) Train Loss: 0.1482, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 06:48:18] (step=0628300) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 06:48:26] (step=0628400) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 06:48:35] (step=0628500) Train Loss: 0.1501, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 06:48:44] (step=0628600) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 06:48:52] (step=0628700) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 06:49:01] (step=0628800) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 06:49:10] (step=0628900) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 06:49:18] (step=0629000) Train Loss: 0.1519, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 06:49:28] (step=0629100) Train Loss: 0.1515, Train Steps/Sec: 10.55, Grad Norm: 0.0064 +[2025-02-20 06:49:37] (step=0629200) Train Loss: 0.1511, Train Steps/Sec: 10.90, Grad Norm: 0.0063 +[2025-02-20 06:49:47] (step=0629300) Train Loss: 0.1507, Train Steps/Sec: 10.07, Grad Norm: 0.0060 +[2025-02-20 06:49:56] (step=0629400) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 06:50:04] (step=0629500) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 06:50:13] (step=0629600) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 06:50:22] (step=0629700) Train Loss: 0.1497, Train Steps/Sec: 11.24, Grad Norm: 0.0059 +[2025-02-20 06:50:30] (step=0629800) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 06:50:39] (step=0629900) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 06:50:48] (step=0630000) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 06:50:56] (step=0630100) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 06:51:05] (step=0630200) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 06:51:14] (step=0630300) Train Loss: 0.1490, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:51:22] (step=0630400) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 06:51:31] (step=0630500) Train Loss: 0.1500, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 06:51:41] (step=0630600) Train Loss: 0.1502, Train Steps/Sec: 10.10, Grad Norm: 0.0061 +[2025-02-20 06:51:50] (step=0630700) Train Loss: 0.1503, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 06:51:58] (step=0630800) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 06:52:07] (step=0630900) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 06:52:15] (step=0631000) Train Loss: 0.1487, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 06:52:24] (step=0631100) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 06:52:33] (step=0631200) Train Loss: 0.1490, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-20 06:52:42] (step=0631300) Train Loss: 0.1494, Train Steps/Sec: 10.86, Grad Norm: 0.0061 +[2025-02-20 06:52:52] (step=0631400) Train Loss: 0.1502, Train Steps/Sec: 10.88, Grad Norm: 0.0062 +[2025-02-20 06:53:00] (step=0631500) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 06:53:09] (step=0631600) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 06:53:17] (step=0631700) Train Loss: 0.1517, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:53:27] (step=0631800) Train Loss: 0.1505, Train Steps/Sec: 10.05, Grad Norm: 0.0061 +[2025-02-20 06:53:36] (step=0631900) Train Loss: 0.1510, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-20 06:53:45] (step=0632000) Train Loss: 0.1507, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 06:53:54] (step=0632100) Train Loss: 0.1505, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 06:54:02] (step=0632200) Train Loss: 0.1498, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 06:54:11] (step=0632300) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 06:54:19] (step=0632400) Train Loss: 0.1498, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 06:54:28] (step=0632500) Train Loss: 0.1517, Train Steps/Sec: 11.54, Grad Norm: 0.0061 +[2025-02-20 06:54:37] (step=0632600) Train Loss: 0.1507, Train Steps/Sec: 11.53, Grad Norm: 0.0060 +[2025-02-20 06:54:46] (step=0632700) Train Loss: 0.1516, Train Steps/Sec: 11.53, Grad Norm: 0.0061 +[2025-02-20 06:54:54] (step=0632800) Train Loss: 0.1506, Train Steps/Sec: 11.53, Grad Norm: 0.0063 +[2025-02-20 06:55:03] (step=0632900) Train Loss: 0.1501, Train Steps/Sec: 11.16, Grad Norm: 0.0060 +[2025-02-20 06:55:12] (step=0633000) Train Loss: 0.1499, Train Steps/Sec: 11.50, Grad Norm: 0.0060 +[2025-02-20 06:55:22] (step=0633100) Train Loss: 0.1510, Train Steps/Sec: 10.02, Grad Norm: 0.0060 +[2025-02-20 06:55:30] (step=0633200) Train Loss: 0.1496, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 06:55:39] (step=0633300) Train Loss: 0.1497, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-20 06:55:49] (step=0633400) Train Loss: 0.1499, Train Steps/Sec: 10.88, Grad Norm: 0.0060 +[2025-02-20 06:55:58] (step=0633500) Train Loss: 0.1511, Train Steps/Sec: 10.56, Grad Norm: 0.0062 +[2025-02-20 06:56:07] (step=0633600) Train Loss: 0.1509, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 06:56:15] (step=0633700) Train Loss: 0.1494, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 06:56:24] (step=0633800) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 06:56:33] (step=0633900) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 06:56:41] (step=0634000) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 06:56:50] (step=0634100) Train Loss: 0.1504, Train Steps/Sec: 11.22, Grad Norm: 0.0061 +[2025-02-20 06:56:59] (step=0634200) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 06:57:09] (step=0634300) Train Loss: 0.1494, Train Steps/Sec: 9.96, Grad Norm: 0.0061 +[2025-02-20 06:57:17] (step=0634400) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 06:57:26] (step=0634500) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0057 +[2025-02-20 06:57:35] (step=0634600) Train Loss: 0.1480, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 06:57:43] (step=0634700) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 06:57:52] (step=0634800) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 06:58:00] (step=0634900) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:58:09] (step=0635000) Train Loss: 0.1490, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 06:58:18] (step=0635100) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 06:58:27] (step=0635200) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 06:58:35] (step=0635300) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 06:58:44] (step=0635400) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 06:58:53] (step=0635500) Train Loss: 0.1485, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 06:59:04] (step=0635600) Train Loss: 0.1503, Train Steps/Sec: 9.21, Grad Norm: 0.0062 +[2025-02-20 06:59:13] (step=0635700) Train Loss: 0.1511, Train Steps/Sec: 10.89, Grad Norm: 0.0062 +[2025-02-20 06:59:21] (step=0635800) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 06:59:30] (step=0635900) Train Loss: 0.1518, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 06:59:39] (step=0636000) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 06:59:47] (step=0636100) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 06:59:56] (step=0636200) Train Loss: 0.1498, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-20 07:00:05] (step=0636300) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 07:00:13] (step=0636400) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 07:00:22] (step=0636500) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 07:00:31] (step=0636600) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 07:00:39] (step=0636700) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 07:00:49] (step=0636800) Train Loss: 0.1497, Train Steps/Sec: 9.99, Grad Norm: 0.0058 +[2025-02-20 07:00:58] (step=0636900) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 07:01:06] (step=0637000) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 07:01:15] (step=0637100) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 07:01:24] (step=0637200) Train Loss: 0.1505, Train Steps/Sec: 11.27, Grad Norm: 0.0063 +[2025-02-20 07:01:32] (step=0637300) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 07:01:41] (step=0637400) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 07:01:50] (step=0637500) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 07:01:58] (step=0637600) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 07:02:07] (step=0637700) Train Loss: 0.1511, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 07:02:17] (step=0637800) Train Loss: 0.1506, Train Steps/Sec: 10.56, Grad Norm: 0.0061 +[2025-02-20 07:02:26] (step=0637900) Train Loss: 0.1490, Train Steps/Sec: 10.91, Grad Norm: 0.0060 +[2025-02-20 07:02:34] (step=0638000) Train Loss: 0.1478, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 07:02:44] (step=0638100) Train Loss: 0.1500, Train Steps/Sec: 10.04, Grad Norm: 0.0060 +[2025-02-20 07:02:53] (step=0638200) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 07:03:02] (step=0638300) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 07:03:11] (step=0638400) Train Loss: 0.1501, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 07:03:19] (step=0638500) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 07:03:28] (step=0638600) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 07:03:36] (step=0638700) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 07:03:45] (step=0638800) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 07:03:54] (step=0638900) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 07:04:02] (step=0639000) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 07:04:11] (step=0639100) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 07:04:19] (step=0639200) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 07:04:29] (step=0639300) Train Loss: 0.1504, Train Steps/Sec: 10.00, Grad Norm: 0.0064 +[2025-02-20 07:04:38] (step=0639400) Train Loss: 0.1511, Train Steps/Sec: 11.26, Grad Norm: 0.0063 +[2025-02-20 07:04:47] (step=0639500) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 07:04:56] (step=0639600) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 07:05:04] (step=0639700) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 07:05:13] (step=0639800) Train Loss: 0.1516, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 07:05:22] (step=0639900) Train Loss: 0.1508, Train Steps/Sec: 10.86, Grad Norm: 0.0064 +[2025-02-20 07:05:31] (step=0640000) Train Loss: 0.1503, Train Steps/Sec: 10.77, Grad Norm: 0.0062 +[2025-02-20 07:05:32] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0640000.pt +[2025-02-20 07:05:41] (step=0640100) Train Loss: 0.1503, Train Steps/Sec: 9.91, Grad Norm: 0.0060 +[2025-02-20 07:05:50] (step=0640200) Train Loss: 0.1492, Train Steps/Sec: 11.48, Grad Norm: 0.0061 +[2025-02-20 07:05:59] (step=0640300) Train Loss: 0.1498, Train Steps/Sec: 11.50, Grad Norm: 0.0062 +[2025-02-20 07:06:08] (step=0640400) Train Loss: 0.1504, Train Steps/Sec: 11.49, Grad Norm: 0.0061 +[2025-02-20 07:06:16] (step=0640500) Train Loss: 0.1503, Train Steps/Sec: 11.45, Grad Norm: 0.0061 +[2025-02-20 07:06:27] (step=0640600) Train Loss: 0.1513, Train Steps/Sec: 9.64, Grad Norm: 0.0059 +[2025-02-20 07:06:35] (step=0640700) Train Loss: 0.1492, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 07:06:44] (step=0640800) Train Loss: 0.1486, Train Steps/Sec: 11.55, Grad Norm: 0.0062 +[2025-02-20 07:06:53] (step=0640900) Train Loss: 0.1494, Train Steps/Sec: 11.55, Grad Norm: 0.0060 +[2025-02-20 07:07:01] (step=0641000) Train Loss: 0.1491, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 07:07:10] (step=0641100) Train Loss: 0.1498, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 07:07:19] (step=0641200) Train Loss: 0.1503, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 07:07:27] (step=0641300) Train Loss: 0.1503, Train Steps/Sec: 11.56, Grad Norm: 0.0065 +[2025-02-20 07:07:36] (step=0641400) Train Loss: 0.1503, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 07:07:45] (step=0641500) Train Loss: 0.1500, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 07:07:53] (step=0641600) Train Loss: 0.1500, Train Steps/Sec: 11.17, Grad Norm: 0.0061 +[2025-02-20 07:08:02] (step=0641700) Train Loss: 0.1501, Train Steps/Sec: 11.56, Grad Norm: 0.0057 +[2025-02-20 07:08:12] (step=0641800) Train Loss: 0.1508, Train Steps/Sec: 9.76, Grad Norm: 0.0062 +[2025-02-20 07:08:21] (step=0641900) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 07:08:30] (step=0642000) Train Loss: 0.1505, Train Steps/Sec: 11.19, Grad Norm: 0.0060 +[2025-02-20 07:08:39] (step=0642100) Train Loss: 0.1498, Train Steps/Sec: 10.82, Grad Norm: 0.0061 +[2025-02-20 07:08:49] (step=0642200) Train Loss: 0.1491, Train Steps/Sec: 10.52, Grad Norm: 0.0061 +[2025-02-20 07:08:57] (step=0642300) Train Loss: 0.1498, Train Steps/Sec: 11.51, Grad Norm: 0.0064 +[2025-02-20 07:09:06] (step=0642400) Train Loss: 0.1508, Train Steps/Sec: 11.51, Grad Norm: 0.0063 +[2025-02-20 07:09:15] (step=0642500) Train Loss: 0.1483, Train Steps/Sec: 11.51, Grad Norm: 0.0061 +[2025-02-20 07:09:23] (step=0642600) Train Loss: 0.1492, Train Steps/Sec: 11.52, Grad Norm: 0.0061 +[2025-02-20 07:09:32] (step=0642700) Train Loss: 0.1502, Train Steps/Sec: 11.53, Grad Norm: 0.0062 +[2025-02-20 07:09:41] (step=0642800) Train Loss: 0.1502, Train Steps/Sec: 11.12, Grad Norm: 0.0060 +[2025-02-20 07:09:50] (step=0642900) Train Loss: 0.1500, Train Steps/Sec: 11.52, Grad Norm: 0.0059 +[2025-02-20 07:09:59] (step=0643000) Train Loss: 0.1490, Train Steps/Sec: 11.48, Grad Norm: 0.0063 +[2025-02-20 07:10:09] (step=0643100) Train Loss: 0.1492, Train Steps/Sec: 9.93, Grad Norm: 0.0061 +[2025-02-20 07:10:17] (step=0643200) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 07:10:26] (step=0643300) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 07:10:34] (step=0643400) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 07:10:43] (step=0643500) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 07:10:52] (step=0643600) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 07:11:01] (step=0643700) Train Loss: 0.1483, Train Steps/Sec: 11.22, Grad Norm: 0.0060 +[2025-02-20 07:11:09] (step=0643800) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 07:11:18] (step=0643900) Train Loss: 0.1491, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 07:11:26] (step=0644000) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 07:11:35] (step=0644100) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 07:11:44] (step=0644200) Train Loss: 0.1515, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 07:11:55] (step=0644300) Train Loss: 0.1494, Train Steps/Sec: 9.23, Grad Norm: 0.0061 +[2025-02-20 07:12:04] (step=0644400) Train Loss: 0.1504, Train Steps/Sec: 10.87, Grad Norm: 0.0062 +[2025-02-20 07:12:13] (step=0644500) Train Loss: 0.1512, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 07:12:21] (step=0644600) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 07:12:30] (step=0644700) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0058 +[2025-02-20 07:12:39] (step=0644800) Train Loss: 0.1493, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 07:12:47] (step=0644900) Train Loss: 0.1494, Train Steps/Sec: 11.21, Grad Norm: 0.0059 +[2025-02-20 07:12:56] (step=0645000) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 07:13:05] (step=0645100) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 07:13:13] (step=0645200) Train Loss: 0.1491, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 07:13:22] (step=0645300) Train Loss: 0.1512, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 07:13:31] (step=0645400) Train Loss: 0.1498, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 07:13:39] (step=0645500) Train Loss: 0.1494, Train Steps/Sec: 11.54, Grad Norm: 0.0060 +[2025-02-20 07:13:49] (step=0645600) Train Loss: 0.1500, Train Steps/Sec: 9.95, Grad Norm: 0.0060 +[2025-02-20 07:13:58] (step=0645700) Train Loss: 0.1507, Train Steps/Sec: 11.53, Grad Norm: 0.0059 +[2025-02-20 07:14:07] (step=0645800) Train Loss: 0.1514, Train Steps/Sec: 11.54, Grad Norm: 0.0060 +[2025-02-20 07:14:16] (step=0645900) Train Loss: 0.1499, Train Steps/Sec: 11.15, Grad Norm: 0.0062 +[2025-02-20 07:14:24] (step=0646000) Train Loss: 0.1498, Train Steps/Sec: 11.52, Grad Norm: 0.0061 +[2025-02-20 07:14:33] (step=0646100) Train Loss: 0.1492, Train Steps/Sec: 11.53, Grad Norm: 0.0060 +[2025-02-20 07:14:42] (step=0646200) Train Loss: 0.1502, Train Steps/Sec: 11.53, Grad Norm: 0.0061 +[2025-02-20 07:14:50] (step=0646300) Train Loss: 0.1508, Train Steps/Sec: 11.53, Grad Norm: 0.0060 +[2025-02-20 07:14:59] (step=0646400) Train Loss: 0.1502, Train Steps/Sec: 11.24, Grad Norm: 0.0063 +[2025-02-20 07:15:09] (step=0646500) Train Loss: 0.1510, Train Steps/Sec: 10.58, Grad Norm: 0.0059 +[2025-02-20 07:15:18] (step=0646600) Train Loss: 0.1505, Train Steps/Sec: 10.90, Grad Norm: 0.0061 +[2025-02-20 07:15:26] (step=0646700) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 07:15:36] (step=0646800) Train Loss: 0.1511, Train Steps/Sec: 10.00, Grad Norm: 0.0061 +[2025-02-20 07:15:45] (step=0646900) Train Loss: 0.1509, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 07:15:54] (step=0647000) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 07:16:03] (step=0647100) Train Loss: 0.1485, Train Steps/Sec: 11.20, Grad Norm: 0.0061 +[2025-02-20 07:16:11] (step=0647200) Train Loss: 0.1505, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 07:16:20] (step=0647300) Train Loss: 0.1486, Train Steps/Sec: 11.57, Grad Norm: 0.0058 +[2025-02-20 07:16:29] (step=0647400) Train Loss: 0.1513, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 07:16:37] (step=0647500) Train Loss: 0.1509, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 07:16:46] (step=0647600) Train Loss: 0.1490, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 07:16:55] (step=0647700) Train Loss: 0.1493, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 07:17:03] (step=0647800) Train Loss: 0.1498, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 07:17:12] (step=0647900) Train Loss: 0.1501, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 07:17:21] (step=0648000) Train Loss: 0.1483, Train Steps/Sec: 11.53, Grad Norm: 0.0060 +[2025-02-20 07:17:31] (step=0648100) Train Loss: 0.1493, Train Steps/Sec: 9.68, Grad Norm: 0.0060 +[2025-02-20 07:17:40] (step=0648200) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 07:17:48] (step=0648300) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 07:17:57] (step=0648400) Train Loss: 0.1485, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 07:18:06] (step=0648500) Train Loss: 0.1482, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 07:18:15] (step=0648600) Train Loss: 0.1509, Train Steps/Sec: 10.92, Grad Norm: 0.0060 +[2025-02-20 07:18:24] (step=0648700) Train Loss: 0.1495, Train Steps/Sec: 10.63, Grad Norm: 0.0061 +[2025-02-20 07:18:33] (step=0648800) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 07:18:41] (step=0648900) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 07:18:50] (step=0649000) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 07:18:59] (step=0649100) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 07:19:07] (step=0649200) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 07:19:18] (step=0649300) Train Loss: 0.1498, Train Steps/Sec: 9.69, Grad Norm: 0.0061 +[2025-02-20 07:19:26] (step=0649400) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 07:19:35] (step=0649500) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:19:43] (step=0649600) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 07:19:52] (step=0649700) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 07:20:01] (step=0649800) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:20:09] (step=0649900) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 07:20:18] (step=0650000) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 07:20:26] (step=0650100) Train Loss: 0.1520, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 07:20:35] (step=0650200) Train Loss: 0.1505, Train Steps/Sec: 11.26, Grad Norm: 0.0060 +[2025-02-20 07:20:44] (step=0650300) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 07:20:52] (step=0650400) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:21:01] (step=0650500) Train Loss: 0.1490, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 07:21:11] (step=0650600) Train Loss: 0.1496, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-20 07:21:20] (step=0650700) Train Loss: 0.1512, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 07:21:29] (step=0650800) Train Loss: 0.1499, Train Steps/Sec: 10.61, Grad Norm: 0.0062 +[2025-02-20 07:21:38] (step=0650900) Train Loss: 0.1492, Train Steps/Sec: 10.91, Grad Norm: 0.0060 +[2025-02-20 07:21:47] (step=0651000) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 07:21:56] (step=0651100) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 07:22:04] (step=0651200) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 07:22:13] (step=0651300) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 07:22:22] (step=0651400) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 07:22:30] (step=0651500) Train Loss: 0.1495, Train Steps/Sec: 11.23, Grad Norm: 0.0058 +[2025-02-20 07:22:39] (step=0651600) Train Loss: 0.1477, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 07:22:48] (step=0651700) Train Loss: 0.1489, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 07:22:58] (step=0651800) Train Loss: 0.1496, Train Steps/Sec: 9.93, Grad Norm: 0.0065 +[2025-02-20 07:23:06] (step=0651900) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 07:23:15] (step=0652000) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0057 +[2025-02-20 07:23:24] (step=0652100) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 07:23:32] (step=0652200) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 07:23:41] (step=0652300) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 07:23:50] (step=0652400) Train Loss: 0.1481, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-20 07:23:58] (step=0652500) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 07:24:07] (step=0652600) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 07:24:16] (step=0652700) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:24:24] (step=0652800) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 07:24:33] (step=0652900) Train Loss: 0.1503, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-20 07:24:42] (step=0653000) Train Loss: 0.1485, Train Steps/Sec: 10.57, Grad Norm: 0.0063 +[2025-02-20 07:24:53] (step=0653100) Train Loss: 0.1508, Train Steps/Sec: 9.43, Grad Norm: 0.0059 +[2025-02-20 07:25:02] (step=0653200) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 07:25:10] (step=0653300) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 07:25:19] (step=0653400) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 07:25:28] (step=0653500) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 07:25:36] (step=0653600) Train Loss: 0.1508, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-20 07:25:45] (step=0653700) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:25:54] (step=0653800) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 07:26:02] (step=0653900) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 07:26:11] (step=0654000) Train Loss: 0.1484, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 07:26:19] (step=0654100) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 07:26:28] (step=0654200) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 07:26:38] (step=0654300) Train Loss: 0.1500, Train Steps/Sec: 9.92, Grad Norm: 0.0058 +[2025-02-20 07:26:47] (step=0654400) Train Loss: 0.1509, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 07:26:55] (step=0654500) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 07:27:04] (step=0654600) Train Loss: 0.1484, Train Steps/Sec: 11.24, Grad Norm: 0.0063 +[2025-02-20 07:27:13] (step=0654700) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:27:22] (step=0654800) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 07:27:30] (step=0654900) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 07:27:39] (step=0655000) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 07:27:48] (step=0655100) Train Loss: 0.1516, Train Steps/Sec: 10.96, Grad Norm: 0.0061 +[2025-02-20 07:27:57] (step=0655200) Train Loss: 0.1492, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-20 07:28:06] (step=0655300) Train Loss: 0.1515, Train Steps/Sec: 10.91, Grad Norm: 0.0063 +[2025-02-20 07:28:15] (step=0655400) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 07:28:23] (step=0655500) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 07:28:33] (step=0655600) Train Loss: 0.1489, Train Steps/Sec: 10.06, Grad Norm: 0.0061 +[2025-02-20 07:28:42] (step=0655700) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:28:51] (step=0655800) Train Loss: 0.1493, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 07:28:59] (step=0655900) Train Loss: 0.1514, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 07:29:08] (step=0656000) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 07:29:17] (step=0656100) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 07:29:25] (step=0656200) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 07:29:34] (step=0656300) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 07:29:42] (step=0656400) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 07:29:51] (step=0656500) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 07:30:00] (step=0656600) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 07:30:08] (step=0656700) Train Loss: 0.1494, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 07:30:18] (step=0656800) Train Loss: 0.1506, Train Steps/Sec: 10.02, Grad Norm: 0.0060 +[2025-02-20 07:30:27] (step=0656900) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:30:36] (step=0657000) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 07:30:44] (step=0657100) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 07:30:53] (step=0657200) Train Loss: 0.1513, Train Steps/Sec: 11.28, Grad Norm: 0.0060 +[2025-02-20 07:31:02] (step=0657300) Train Loss: 0.1490, Train Steps/Sec: 10.93, Grad Norm: 0.0065 +[2025-02-20 07:31:12] (step=0657400) Train Loss: 0.1505, Train Steps/Sec: 10.63, Grad Norm: 0.0062 +[2025-02-20 07:31:20] (step=0657500) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 07:31:29] (step=0657600) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 07:31:37] (step=0657700) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 07:31:46] (step=0657800) Train Loss: 0.1480, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:31:55] (step=0657900) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 07:32:04] (step=0658000) Train Loss: 0.1508, Train Steps/Sec: 11.20, Grad Norm: 0.0060 +[2025-02-20 07:32:14] (step=0658100) Train Loss: 0.1497, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-20 07:32:22] (step=0658200) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 07:32:31] (step=0658300) Train Loss: 0.1487, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:32:39] (step=0658400) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 07:32:48] (step=0658500) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 07:32:57] (step=0658600) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:33:05] (step=0658700) Train Loss: 0.1492, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 07:33:14] (step=0658800) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 07:33:23] (step=0658900) Train Loss: 0.1504, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 07:33:31] (step=0659000) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 07:33:40] (step=0659100) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 07:33:48] (step=0659200) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 07:33:58] (step=0659300) Train Loss: 0.1487, Train Steps/Sec: 10.00, Grad Norm: 0.0060 +[2025-02-20 07:34:07] (step=0659400) Train Loss: 0.1502, Train Steps/Sec: 11.27, Grad Norm: 0.0058 +[2025-02-20 07:34:17] (step=0659500) Train Loss: 0.1507, Train Steps/Sec: 10.61, Grad Norm: 0.0061 +[2025-02-20 07:34:26] (step=0659600) Train Loss: 0.1506, Train Steps/Sec: 10.92, Grad Norm: 0.0060 +[2025-02-20 07:34:34] (step=0659700) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 07:34:43] (step=0659800) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 07:34:52] (step=0659900) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:35:00] (step=0660000) Train Loss: 0.1489, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 07:35:01] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0660000.pt +[2025-02-20 07:35:09] (step=0660100) Train Loss: 0.1514, Train Steps/Sec: 10.81, Grad Norm: 0.0059 +[2025-02-20 07:35:18] (step=0660200) Train Loss: 0.1496, Train Steps/Sec: 11.24, Grad Norm: 0.0059 +[2025-02-20 07:35:27] (step=0660300) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 07:35:36] (step=0660400) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:35:44] (step=0660500) Train Loss: 0.1499, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 07:35:54] (step=0660600) Train Loss: 0.1486, Train Steps/Sec: 10.08, Grad Norm: 0.0059 +[2025-02-20 07:36:03] (step=0660700) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:36:11] (step=0660800) Train Loss: 0.1496, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 07:36:20] (step=0660900) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:36:29] (step=0661000) Train Loss: 0.1505, Train Steps/Sec: 11.26, Grad Norm: 0.0060 +[2025-02-20 07:36:37] (step=0661100) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:36:46] (step=0661200) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 07:36:55] (step=0661300) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:37:03] (step=0661400) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 07:37:12] (step=0661500) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:37:21] (step=0661600) Train Loss: 0.1500, Train Steps/Sec: 11.29, Grad Norm: 0.0060 +[2025-02-20 07:37:30] (step=0661700) Train Loss: 0.1500, Train Steps/Sec: 10.63, Grad Norm: 0.0063 +[2025-02-20 07:37:41] (step=0661800) Train Loss: 0.1491, Train Steps/Sec: 9.45, Grad Norm: 0.0063 +[2025-02-20 07:37:49] (step=0661900) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:37:58] (step=0662000) Train Loss: 0.1507, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-20 07:38:06] (step=0662100) Train Loss: 0.1506, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 07:38:15] (step=0662200) Train Loss: 0.1490, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 07:38:24] (step=0662300) Train Loss: 0.1491, Train Steps/Sec: 11.26, Grad Norm: 0.0062 +[2025-02-20 07:38:32] (step=0662400) Train Loss: 0.1495, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 07:38:41] (step=0662500) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:38:50] (step=0662600) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 07:38:58] (step=0662700) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 07:39:07] (step=0662800) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 07:39:15] (step=0662900) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:39:24] (step=0663000) Train Loss: 0.1508, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 07:39:34] (step=0663100) Train Loss: 0.1529, Train Steps/Sec: 10.03, Grad Norm: 0.0059 +[2025-02-20 07:39:43] (step=0663200) Train Loss: 0.1502, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 07:39:52] (step=0663300) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 07:40:00] (step=0663400) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:40:09] (step=0663500) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 07:40:17] (step=0663600) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 07:40:26] (step=0663700) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 07:40:35] (step=0663800) Train Loss: 0.1501, Train Steps/Sec: 10.96, Grad Norm: 0.0060 +[2025-02-20 07:40:44] (step=0663900) Train Loss: 0.1494, Train Steps/Sec: 10.94, Grad Norm: 0.0062 +[2025-02-20 07:40:53] (step=0664000) Train Loss: 0.1495, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-20 07:41:02] (step=0664100) Train Loss: 0.1489, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:41:11] (step=0664200) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:41:21] (step=0664300) Train Loss: 0.1491, Train Steps/Sec: 10.03, Grad Norm: 0.0061 +[2025-02-20 07:41:29] (step=0664400) Train Loss: 0.1489, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 07:41:38] (step=0664500) Train Loss: 0.1499, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 07:41:47] (step=0664600) Train Loss: 0.1487, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 07:41:55] (step=0664700) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 07:42:04] (step=0664800) Train Loss: 0.1492, Train Steps/Sec: 11.65, Grad Norm: 0.0058 +[2025-02-20 07:42:12] (step=0664900) Train Loss: 0.1520, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 07:42:21] (step=0665000) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 07:42:29] (step=0665100) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:42:38] (step=0665200) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:42:47] (step=0665300) Train Loss: 0.1490, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 07:42:56] (step=0665400) Train Loss: 0.1493, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 07:43:04] (step=0665500) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 07:43:14] (step=0665600) Train Loss: 0.1510, Train Steps/Sec: 10.06, Grad Norm: 0.0063 +[2025-02-20 07:43:23] (step=0665700) Train Loss: 0.1485, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 07:43:31] (step=0665800) Train Loss: 0.1488, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 07:43:40] (step=0665900) Train Loss: 0.1492, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-20 07:43:50] (step=0666000) Train Loss: 0.1490, Train Steps/Sec: 10.65, Grad Norm: 0.0058 +[2025-02-20 07:43:59] (step=0666100) Train Loss: 0.1498, Train Steps/Sec: 10.94, Grad Norm: 0.0061 +[2025-02-20 07:44:07] (step=0666200) Train Loss: 0.1508, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 07:44:16] (step=0666300) Train Loss: 0.1508, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 07:44:24] (step=0666400) Train Loss: 0.1503, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-20 07:44:33] (step=0666500) Train Loss: 0.1491, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 07:44:42] (step=0666600) Train Loss: 0.1505, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 07:44:50] (step=0666700) Train Loss: 0.1500, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 07:45:00] (step=0666800) Train Loss: 0.1498, Train Steps/Sec: 10.02, Grad Norm: 0.0065 +[2025-02-20 07:45:09] (step=0666900) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 07:45:18] (step=0667000) Train Loss: 0.1515, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 07:45:26] (step=0667100) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:45:35] (step=0667200) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 07:45:43] (step=0667300) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 07:45:52] (step=0667400) Train Loss: 0.1507, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 07:46:00] (step=0667500) Train Loss: 0.1508, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 07:46:09] (step=0667600) Train Loss: 0.1503, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-20 07:46:18] (step=0667700) Train Loss: 0.1498, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 07:46:26] (step=0667800) Train Loss: 0.1496, Train Steps/Sec: 11.66, Grad Norm: 0.0058 +[2025-02-20 07:46:35] (step=0667900) Train Loss: 0.1489, Train Steps/Sec: 11.67, Grad Norm: 0.0062 +[2025-02-20 07:46:44] (step=0668000) Train Loss: 0.1489, Train Steps/Sec: 11.67, Grad Norm: 0.0065 +[2025-02-20 07:46:54] (step=0668100) Train Loss: 0.1491, Train Steps/Sec: 9.74, Grad Norm: 0.0060 +[2025-02-20 07:47:03] (step=0668200) Train Loss: 0.1507, Train Steps/Sec: 10.64, Grad Norm: 0.0061 +[2025-02-20 07:47:12] (step=0668300) Train Loss: 0.1501, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-20 07:47:21] (step=0668400) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:47:30] (step=0668500) Train Loss: 0.1485, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 07:47:38] (step=0668600) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:47:47] (step=0668700) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:47:55] (step=0668800) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 07:48:04] (step=0668900) Train Loss: 0.1509, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-20 07:48:13] (step=0669000) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:48:22] (step=0669100) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 07:48:30] (step=0669200) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 07:48:40] (step=0669300) Train Loss: 0.1493, Train Steps/Sec: 9.90, Grad Norm: 0.0061 +[2025-02-20 07:48:49] (step=0669400) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 07:48:57] (step=0669500) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 07:49:06] (step=0669600) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 07:49:15] (step=0669700) Train Loss: 0.1505, Train Steps/Sec: 11.23, Grad Norm: 0.0059 +[2025-02-20 07:49:24] (step=0669800) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 07:49:32] (step=0669900) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 07:49:41] (step=0670000) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 07:49:49] (step=0670100) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 07:49:58] (step=0670200) Train Loss: 0.1488, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 07:50:07] (step=0670300) Train Loss: 0.1503, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 07:50:16] (step=0670400) Train Loss: 0.1491, Train Steps/Sec: 10.61, Grad Norm: 0.0063 +[2025-02-20 07:50:25] (step=0670500) Train Loss: 0.1503, Train Steps/Sec: 10.91, Grad Norm: 0.0061 +[2025-02-20 07:50:36] (step=0670600) Train Loss: 0.1498, Train Steps/Sec: 9.92, Grad Norm: 0.0060 +[2025-02-20 07:50:44] (step=0670700) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:50:53] (step=0670800) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 07:51:01] (step=0670900) Train Loss: 0.1507, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 07:51:10] (step=0671000) Train Loss: 0.1501, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 07:51:19] (step=0671100) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:51:27] (step=0671200) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 07:51:36] (step=0671300) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 07:51:45] (step=0671400) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 07:51:53] (step=0671500) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 07:52:02] (step=0671600) Train Loss: 0.1519, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:52:10] (step=0671700) Train Loss: 0.1514, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 07:52:20] (step=0671800) Train Loss: 0.1496, Train Steps/Sec: 10.00, Grad Norm: 0.0063 +[2025-02-20 07:52:29] (step=0671900) Train Loss: 0.1497, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-20 07:52:38] (step=0672000) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 07:52:46] (step=0672100) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 07:52:55] (step=0672200) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 07:53:04] (step=0672300) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 07:53:12] (step=0672400) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 07:53:22] (step=0672500) Train Loss: 0.1502, Train Steps/Sec: 10.67, Grad Norm: 0.0062 +[2025-02-20 07:53:31] (step=0672600) Train Loss: 0.1508, Train Steps/Sec: 10.61, Grad Norm: 0.0066 +[2025-02-20 07:53:40] (step=0672700) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 07:53:48] (step=0672800) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 07:53:57] (step=0672900) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 07:54:06] (step=0673000) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 07:54:16] (step=0673100) Train Loss: 0.1495, Train Steps/Sec: 9.94, Grad Norm: 0.0060 +[2025-02-20 07:54:24] (step=0673200) Train Loss: 0.1488, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-20 07:54:33] (step=0673300) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 07:54:42] (step=0673400) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 07:54:50] (step=0673500) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 07:54:59] (step=0673600) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 07:55:07] (step=0673700) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 07:55:16] (step=0673800) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 07:55:25] (step=0673900) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:55:33] (step=0674000) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 07:55:42] (step=0674100) Train Loss: 0.1498, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 07:55:51] (step=0674200) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 07:56:01] (step=0674300) Train Loss: 0.1496, Train Steps/Sec: 10.00, Grad Norm: 0.0061 +[2025-02-20 07:56:09] (step=0674400) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 07:56:18] (step=0674500) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 07:56:27] (step=0674600) Train Loss: 0.1508, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 07:56:36] (step=0674700) Train Loss: 0.1501, Train Steps/Sec: 10.63, Grad Norm: 0.0060 +[2025-02-20 07:56:45] (step=0674800) Train Loss: 0.1500, Train Steps/Sec: 10.93, Grad Norm: 0.0059 +[2025-02-20 07:56:54] (step=0674900) Train Loss: 0.1485, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 07:57:03] (step=0675000) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 07:57:11] (step=0675100) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 07:57:20] (step=0675200) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 07:57:28] (step=0675300) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 07:57:37] (step=0675400) Train Loss: 0.1505, Train Steps/Sec: 11.27, Grad Norm: 0.0062 +[2025-02-20 07:57:46] (step=0675500) Train Loss: 0.1497, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 07:57:56] (step=0675600) Train Loss: 0.1490, Train Steps/Sec: 9.96, Grad Norm: 0.0061 +[2025-02-20 07:58:05] (step=0675700) Train Loss: 0.1507, Train Steps/Sec: 11.48, Grad Norm: 0.0062 +[2025-02-20 07:58:13] (step=0675800) Train Loss: 0.1491, Train Steps/Sec: 11.48, Grad Norm: 0.0061 +[2025-02-20 07:58:22] (step=0675900) Train Loss: 0.1497, Train Steps/Sec: 11.49, Grad Norm: 0.0060 +[2025-02-20 07:58:31] (step=0676000) Train Loss: 0.1512, Train Steps/Sec: 11.50, Grad Norm: 0.0060 +[2025-02-20 07:58:39] (step=0676100) Train Loss: 0.1506, Train Steps/Sec: 11.50, Grad Norm: 0.0063 +[2025-02-20 07:58:48] (step=0676200) Train Loss: 0.1512, Train Steps/Sec: 11.11, Grad Norm: 0.0061 +[2025-02-20 07:58:57] (step=0676300) Train Loss: 0.1506, Train Steps/Sec: 11.49, Grad Norm: 0.0062 +[2025-02-20 07:59:06] (step=0676400) Train Loss: 0.1501, Train Steps/Sec: 11.49, Grad Norm: 0.0058 +[2025-02-20 07:59:15] (step=0676500) Train Loss: 0.1499, Train Steps/Sec: 11.49, Grad Norm: 0.0061 +[2025-02-20 07:59:23] (step=0676600) Train Loss: 0.1507, Train Steps/Sec: 11.49, Grad Norm: 0.0060 +[2025-02-20 07:59:32] (step=0676700) Train Loss: 0.1509, Train Steps/Sec: 11.46, Grad Norm: 0.0060 +[2025-02-20 07:59:42] (step=0676800) Train Loss: 0.1502, Train Steps/Sec: 9.67, Grad Norm: 0.0061 +[2025-02-20 07:59:52] (step=0676900) Train Loss: 0.1483, Train Steps/Sec: 10.55, Grad Norm: 0.0058 +[2025-02-20 08:00:01] (step=0677000) Train Loss: 0.1503, Train Steps/Sec: 10.91, Grad Norm: 0.0063 +[2025-02-20 08:00:10] (step=0677100) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 08:00:18] (step=0677200) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 08:00:27] (step=0677300) Train Loss: 0.1495, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 08:00:35] (step=0677400) Train Loss: 0.1511, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 08:00:44] (step=0677500) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 08:00:53] (step=0677600) Train Loss: 0.1503, Train Steps/Sec: 11.21, Grad Norm: 0.0058 +[2025-02-20 08:01:02] (step=0677700) Train Loss: 0.1505, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 08:01:10] (step=0677800) Train Loss: 0.1490, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 08:01:19] (step=0677900) Train Loss: 0.1495, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 08:01:28] (step=0678000) Train Loss: 0.1498, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 08:01:38] (step=0678100) Train Loss: 0.1512, Train Steps/Sec: 9.77, Grad Norm: 0.0061 +[2025-02-20 08:01:46] (step=0678200) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:01:55] (step=0678300) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:02:04] (step=0678400) Train Loss: 0.1476, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 08:02:12] (step=0678500) Train Loss: 0.1527, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 08:02:21] (step=0678600) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 08:02:30] (step=0678700) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 08:02:38] (step=0678800) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:02:47] (step=0678900) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 08:02:56] (step=0679000) Train Loss: 0.1507, Train Steps/Sec: 10.97, Grad Norm: 0.0060 +[2025-02-20 08:03:05] (step=0679100) Train Loss: 0.1505, Train Steps/Sec: 10.90, Grad Norm: 0.0064 +[2025-02-20 08:03:14] (step=0679200) Train Loss: 0.1502, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-20 08:03:24] (step=0679300) Train Loss: 0.1492, Train Steps/Sec: 9.99, Grad Norm: 0.0061 +[2025-02-20 08:03:33] (step=0679400) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 08:03:42] (step=0679500) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:03:50] (step=0679600) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 08:03:59] (step=0679700) Train Loss: 0.1496, Train Steps/Sec: 11.26, Grad Norm: 0.0062 +[2025-02-20 08:04:08] (step=0679800) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 08:04:16] (step=0679900) Train Loss: 0.1523, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:04:25] (step=0680000) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 08:04:26] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0680000.pt +[2025-02-20 08:04:34] (step=0680100) Train Loss: 0.1492, Train Steps/Sec: 10.59, Grad Norm: 0.0061 +[2025-02-20 08:04:43] (step=0680200) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:04:51] (step=0680300) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:05:00] (step=0680400) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:05:09] (step=0680500) Train Loss: 0.1504, Train Steps/Sec: 11.26, Grad Norm: 0.0062 +[2025-02-20 08:05:19] (step=0680600) Train Loss: 0.1511, Train Steps/Sec: 10.05, Grad Norm: 0.0062 +[2025-02-20 08:05:27] (step=0680700) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 08:05:36] (step=0680800) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 08:05:45] (step=0680900) Train Loss: 0.1519, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 08:05:53] (step=0681000) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 08:06:02] (step=0681100) Train Loss: 0.1494, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 08:06:12] (step=0681200) Train Loss: 0.1501, Train Steps/Sec: 10.59, Grad Norm: 0.0061 +[2025-02-20 08:06:21] (step=0681300) Train Loss: 0.1502, Train Steps/Sec: 10.92, Grad Norm: 0.0060 +[2025-02-20 08:06:29] (step=0681400) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 08:06:38] (step=0681500) Train Loss: 0.1487, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 08:06:47] (step=0681600) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 08:06:55] (step=0681700) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 08:07:05] (step=0681800) Train Loss: 0.1500, Train Steps/Sec: 10.00, Grad Norm: 0.0060 +[2025-02-20 08:07:14] (step=0681900) Train Loss: 0.1498, Train Steps/Sec: 11.25, Grad Norm: 0.0064 +[2025-02-20 08:07:23] (step=0682000) Train Loss: 0.1486, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 08:07:31] (step=0682100) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 08:07:40] (step=0682200) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:07:49] (step=0682300) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 08:07:57] (step=0682400) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:08:06] (step=0682500) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 08:08:14] (step=0682600) Train Loss: 0.1514, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 08:08:23] (step=0682700) Train Loss: 0.1498, Train Steps/Sec: 11.28, Grad Norm: 0.0059 +[2025-02-20 08:08:32] (step=0682800) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 08:08:40] (step=0682900) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 08:08:49] (step=0683000) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 08:08:59] (step=0683100) Train Loss: 0.1503, Train Steps/Sec: 9.96, Grad Norm: 0.0062 +[2025-02-20 08:09:08] (step=0683200) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 08:09:17] (step=0683300) Train Loss: 0.1493, Train Steps/Sec: 11.29, Grad Norm: 0.0064 +[2025-02-20 08:09:26] (step=0683400) Train Loss: 0.1509, Train Steps/Sec: 10.60, Grad Norm: 0.0059 +[2025-02-20 08:09:35] (step=0683500) Train Loss: 0.1498, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-20 08:09:44] (step=0683600) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-20 08:09:52] (step=0683700) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 08:10:01] (step=0683800) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 08:10:10] (step=0683900) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:10:18] (step=0684000) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:10:27] (step=0684100) Train Loss: 0.1494, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 08:10:36] (step=0684200) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 08:10:46] (step=0684300) Train Loss: 0.1498, Train Steps/Sec: 10.05, Grad Norm: 0.0062 +[2025-02-20 08:10:54] (step=0684400) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:11:03] (step=0684500) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:11:11] (step=0684600) Train Loss: 0.1486, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 08:11:20] (step=0684700) Train Loss: 0.1486, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:11:29] (step=0684800) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 08:11:37] (step=0684900) Train Loss: 0.1503, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 08:11:46] (step=0685000) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 08:11:55] (step=0685100) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 08:12:03] (step=0685200) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 08:12:12] (step=0685300) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:12:20] (step=0685400) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 08:12:29] (step=0685500) Train Loss: 0.1495, Train Steps/Sec: 11.30, Grad Norm: 0.0063 +[2025-02-20 08:12:40] (step=0685600) Train Loss: 0.1497, Train Steps/Sec: 9.25, Grad Norm: 0.0062 +[2025-02-20 08:12:49] (step=0685700) Train Loss: 0.1502, Train Steps/Sec: 10.92, Grad Norm: 0.0060 +[2025-02-20 08:12:58] (step=0685800) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 08:13:07] (step=0685900) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 08:13:15] (step=0686000) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:13:24] (step=0686100) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 08:13:32] (step=0686200) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 08:13:41] (step=0686300) Train Loss: 0.1507, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 08:13:50] (step=0686400) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 08:13:58] (step=0686500) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 08:14:07] (step=0686600) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:14:16] (step=0686700) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 08:14:26] (step=0686800) Train Loss: 0.1503, Train Steps/Sec: 10.00, Grad Norm: 0.0060 +[2025-02-20 08:14:34] (step=0686900) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:14:43] (step=0687000) Train Loss: 0.1492, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-20 08:14:52] (step=0687100) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 08:15:00] (step=0687200) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:15:09] (step=0687300) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:15:18] (step=0687400) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 08:15:26] (step=0687500) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:15:35] (step=0687600) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 08:15:44] (step=0687700) Train Loss: 0.1498, Train Steps/Sec: 10.31, Grad Norm: 0.0062 +[2025-02-20 08:15:53] (step=0687800) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 08:16:02] (step=0687900) Train Loss: 0.1493, Train Steps/Sec: 10.92, Grad Norm: 0.0060 +[2025-02-20 08:16:11] (step=0688000) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 08:16:21] (step=0688100) Train Loss: 0.1512, Train Steps/Sec: 9.99, Grad Norm: 0.0059 +[2025-02-20 08:16:29] (step=0688200) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 08:16:38] (step=0688300) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 08:16:47] (step=0688400) Train Loss: 0.1512, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-20 08:16:56] (step=0688500) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:17:04] (step=0688600) Train Loss: 0.1479, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 08:17:13] (step=0688700) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:17:21] (step=0688800) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 08:17:30] (step=0688900) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 08:17:39] (step=0689000) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 08:17:47] (step=0689100) Train Loss: 0.1488, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:17:56] (step=0689200) Train Loss: 0.1491, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 08:18:05] (step=0689300) Train Loss: 0.1508, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 08:18:15] (step=0689400) Train Loss: 0.1511, Train Steps/Sec: 10.11, Grad Norm: 0.0062 +[2025-02-20 08:18:23] (step=0689500) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 08:18:32] (step=0689600) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 08:18:40] (step=0689700) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0058 +[2025-02-20 08:18:49] (step=0689800) Train Loss: 0.1498, Train Steps/Sec: 11.30, Grad Norm: 0.0060 +[2025-02-20 08:18:59] (step=0689900) Train Loss: 0.1504, Train Steps/Sec: 10.60, Grad Norm: 0.0063 +[2025-02-20 08:19:08] (step=0690000) Train Loss: 0.1501, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-20 08:19:16] (step=0690100) Train Loss: 0.1488, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:19:25] (step=0690200) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 08:19:34] (step=0690300) Train Loss: 0.1490, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 08:19:42] (step=0690400) Train Loss: 0.1514, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 08:19:51] (step=0690500) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 08:20:01] (step=0690600) Train Loss: 0.1507, Train Steps/Sec: 9.71, Grad Norm: 0.0060 +[2025-02-20 08:20:10] (step=0690700) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 08:20:18] (step=0690800) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:20:27] (step=0690900) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:20:36] (step=0691000) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 08:20:44] (step=0691100) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:20:53] (step=0691200) Train Loss: 0.1486, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:21:01] (step=0691300) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:21:10] (step=0691400) Train Loss: 0.1502, Train Steps/Sec: 11.29, Grad Norm: 0.0063 +[2025-02-20 08:21:19] (step=0691500) Train Loss: 0.1487, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 08:21:27] (step=0691600) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 08:21:36] (step=0691700) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 08:21:45] (step=0691800) Train Loss: 0.1509, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 08:21:55] (step=0691900) Train Loss: 0.1501, Train Steps/Sec: 10.02, Grad Norm: 0.0059 +[2025-02-20 08:22:03] (step=0692000) Train Loss: 0.1498, Train Steps/Sec: 11.29, Grad Norm: 0.0062 +[2025-02-20 08:22:13] (step=0692100) Train Loss: 0.1503, Train Steps/Sec: 10.58, Grad Norm: 0.0062 +[2025-02-20 08:22:22] (step=0692200) Train Loss: 0.1499, Train Steps/Sec: 10.93, Grad Norm: 0.0059 +[2025-02-20 08:22:31] (step=0692300) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 08:22:39] (step=0692400) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:22:48] (step=0692500) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 08:22:57] (step=0692600) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 08:23:05] (step=0692700) Train Loss: 0.1486, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 08:23:14] (step=0692800) Train Loss: 0.1508, Train Steps/Sec: 11.16, Grad Norm: 0.0061 +[2025-02-20 08:23:23] (step=0692900) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 08:23:31] (step=0693000) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 08:23:41] (step=0693100) Train Loss: 0.1493, Train Steps/Sec: 10.02, Grad Norm: 0.0063 +[2025-02-20 08:23:50] (step=0693200) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 08:23:59] (step=0693300) Train Loss: 0.1482, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 08:24:07] (step=0693400) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 08:24:16] (step=0693500) Train Loss: 0.1487, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 08:24:25] (step=0693600) Train Loss: 0.1518, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:24:33] (step=0693700) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 08:24:42] (step=0693800) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:24:50] (step=0693900) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 08:24:59] (step=0694000) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 08:25:08] (step=0694100) Train Loss: 0.1519, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 08:25:17] (step=0694200) Train Loss: 0.1494, Train Steps/Sec: 11.29, Grad Norm: 0.0062 +[2025-02-20 08:25:26] (step=0694300) Train Loss: 0.1506, Train Steps/Sec: 10.54, Grad Norm: 0.0063 +[2025-02-20 08:25:37] (step=0694400) Train Loss: 0.1520, Train Steps/Sec: 9.47, Grad Norm: 0.0061 +[2025-02-20 08:25:45] (step=0694500) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 08:25:54] (step=0694600) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0067 +[2025-02-20 08:26:02] (step=0694700) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0067 +[2025-02-20 08:26:11] (step=0694800) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 08:26:20] (step=0694900) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 08:26:29] (step=0695000) Train Loss: 0.1495, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 08:26:37] (step=0695100) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 08:26:46] (step=0695200) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:26:54] (step=0695300) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:27:03] (step=0695400) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 08:27:12] (step=0695500) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:27:22] (step=0695600) Train Loss: 0.1512, Train Steps/Sec: 10.02, Grad Norm: 0.0063 +[2025-02-20 08:27:30] (step=0695700) Train Loss: 0.1510, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 08:27:39] (step=0695800) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 08:27:48] (step=0695900) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 08:27:56] (step=0696000) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 08:28:05] (step=0696100) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 08:28:13] (step=0696200) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 08:28:22] (step=0696300) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 08:28:32] (step=0696400) Train Loss: 0.1490, Train Steps/Sec: 10.34, Grad Norm: 0.0062 +[2025-02-20 08:28:40] (step=0696500) Train Loss: 0.1500, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 08:28:49] (step=0696600) Train Loss: 0.1490, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-20 08:28:58] (step=0696700) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 08:29:07] (step=0696800) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 08:29:16] (step=0696900) Train Loss: 0.1496, Train Steps/Sec: 10.08, Grad Norm: 0.0062 +[2025-02-20 08:29:25] (step=0697000) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0058 +[2025-02-20 08:29:34] (step=0697100) Train Loss: 0.1496, Train Steps/Sec: 11.22, Grad Norm: 0.0061 +[2025-02-20 08:29:43] (step=0697200) Train Loss: 0.1510, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 08:29:51] (step=0697300) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 08:30:00] (step=0697400) Train Loss: 0.1504, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 08:30:09] (step=0697500) Train Loss: 0.1498, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 08:30:17] (step=0697600) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 08:30:26] (step=0697700) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:30:34] (step=0697800) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:30:43] (step=0697900) Train Loss: 0.1497, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-20 08:30:52] (step=0698000) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:31:02] (step=0698100) Train Loss: 0.1499, Train Steps/Sec: 10.03, Grad Norm: 0.0059 +[2025-02-20 08:31:10] (step=0698200) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:31:19] (step=0698300) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 08:31:28] (step=0698400) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 08:31:36] (step=0698500) Train Loss: 0.1492, Train Steps/Sec: 11.28, Grad Norm: 0.0060 +[2025-02-20 08:31:46] (step=0698600) Train Loss: 0.1515, Train Steps/Sec: 10.60, Grad Norm: 0.0060 +[2025-02-20 08:31:55] (step=0698700) Train Loss: 0.1496, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-20 08:32:04] (step=0698800) Train Loss: 0.1487, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 08:32:12] (step=0698900) Train Loss: 0.1481, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 08:32:21] (step=0699000) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:32:29] (step=0699100) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 08:32:38] (step=0699200) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 08:32:47] (step=0699300) Train Loss: 0.1495, Train Steps/Sec: 11.21, Grad Norm: 0.0063 +[2025-02-20 08:32:57] (step=0699400) Train Loss: 0.1509, Train Steps/Sec: 10.09, Grad Norm: 0.0062 +[2025-02-20 08:33:06] (step=0699500) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 08:33:14] (step=0699600) Train Loss: 0.1487, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 08:33:23] (step=0699700) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:33:31] (step=0699800) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 08:33:40] (step=0699900) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 08:33:49] (step=0700000) Train Loss: 0.1518, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 08:33:50] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0700000.pt +[2025-02-20 08:33:58] (step=0700100) Train Loss: 0.1498, Train Steps/Sec: 10.57, Grad Norm: 0.0063 +[2025-02-20 08:34:07] (step=0700200) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 08:34:16] (step=0700300) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:34:24] (step=0700400) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:34:33] (step=0700500) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 08:34:43] (step=0700600) Train Loss: 0.1493, Train Steps/Sec: 10.06, Grad Norm: 0.0062 +[2025-02-20 08:34:52] (step=0700700) Train Loss: 0.1489, Train Steps/Sec: 11.29, Grad Norm: 0.0060 +[2025-02-20 08:35:01] (step=0700800) Train Loss: 0.1501, Train Steps/Sec: 10.60, Grad Norm: 0.0061 +[2025-02-20 08:35:10] (step=0700900) Train Loss: 0.1495, Train Steps/Sec: 10.93, Grad Norm: 0.0065 +[2025-02-20 08:35:19] (step=0701000) Train Loss: 0.1501, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 08:35:27] (step=0701100) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 08:35:36] (step=0701200) Train Loss: 0.1491, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 08:35:44] (step=0701300) Train Loss: 0.1509, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 08:35:53] (step=0701400) Train Loss: 0.1511, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-20 08:36:02] (step=0701500) Train Loss: 0.1509, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 08:36:10] (step=0701600) Train Loss: 0.1509, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 08:36:19] (step=0701700) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 08:36:28] (step=0701800) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 08:36:38] (step=0701900) Train Loss: 0.1505, Train Steps/Sec: 10.07, Grad Norm: 0.0062 +[2025-02-20 08:36:46] (step=0702000) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 08:36:55] (step=0702100) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 08:37:04] (step=0702200) Train Loss: 0.1502, Train Steps/Sec: 11.28, Grad Norm: 0.0058 +[2025-02-20 08:37:12] (step=0702300) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 08:37:21] (step=0702400) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:37:29] (step=0702500) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:37:38] (step=0702600) Train Loss: 0.1499, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 08:37:47] (step=0702700) Train Loss: 0.1500, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 08:37:55] (step=0702800) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 08:38:04] (step=0702900) Train Loss: 0.1508, Train Steps/Sec: 10.98, Grad Norm: 0.0063 +[2025-02-20 08:38:14] (step=0703000) Train Loss: 0.1496, Train Steps/Sec: 10.91, Grad Norm: 0.0062 +[2025-02-20 08:38:24] (step=0703100) Train Loss: 0.1496, Train Steps/Sec: 9.50, Grad Norm: 0.0061 +[2025-02-20 08:38:33] (step=0703200) Train Loss: 0.1500, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 08:38:41] (step=0703300) Train Loss: 0.1506, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 08:38:50] (step=0703400) Train Loss: 0.1500, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 08:38:58] (step=0703500) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 08:39:07] (step=0703600) Train Loss: 0.1527, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 08:39:16] (step=0703700) Train Loss: 0.1508, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 08:39:24] (step=0703800) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 08:39:33] (step=0703900) Train Loss: 0.1490, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 08:39:42] (step=0704000) Train Loss: 0.1497, Train Steps/Sec: 11.66, Grad Norm: 0.0058 +[2025-02-20 08:39:50] (step=0704100) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 08:39:59] (step=0704200) Train Loss: 0.1495, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 08:40:08] (step=0704300) Train Loss: 0.1502, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 08:40:18] (step=0704400) Train Loss: 0.1516, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-20 08:40:26] (step=0704500) Train Loss: 0.1487, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 08:40:35] (step=0704600) Train Loss: 0.1500, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 08:40:43] (step=0704700) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 08:40:52] (step=0704800) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 08:41:01] (step=0704900) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 08:41:09] (step=0705000) Train Loss: 0.1497, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 08:41:19] (step=0705100) Train Loss: 0.1508, Train Steps/Sec: 10.33, Grad Norm: 0.0061 +[2025-02-20 08:41:28] (step=0705200) Train Loss: 0.1492, Train Steps/Sec: 10.96, Grad Norm: 0.0061 +[2025-02-20 08:41:37] (step=0705300) Train Loss: 0.1482, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 08:41:45] (step=0705400) Train Loss: 0.1525, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 08:41:54] (step=0705500) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 08:42:04] (step=0705600) Train Loss: 0.1493, Train Steps/Sec: 9.98, Grad Norm: 0.0059 +[2025-02-20 08:42:12] (step=0705700) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 08:42:21] (step=0705800) Train Loss: 0.1509, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 08:42:30] (step=0705900) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 08:42:38] (step=0706000) Train Loss: 0.1497, Train Steps/Sec: 11.65, Grad Norm: 0.0069 +[2025-02-20 08:42:47] (step=0706100) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:42:56] (step=0706200) Train Loss: 0.1486, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 08:43:04] (step=0706300) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 08:43:13] (step=0706400) Train Loss: 0.1508, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 08:43:22] (step=0706500) Train Loss: 0.1496, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 08:43:30] (step=0706600) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 08:43:39] (step=0706700) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 08:43:48] (step=0706800) Train Loss: 0.1500, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 08:43:57] (step=0706900) Train Loss: 0.1497, Train Steps/Sec: 10.03, Grad Norm: 0.0062 +[2025-02-20 08:44:06] (step=0707000) Train Loss: 0.1516, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 08:44:15] (step=0707100) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 08:44:24] (step=0707200) Train Loss: 0.1502, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 08:44:33] (step=0707300) Train Loss: 0.1500, Train Steps/Sec: 10.57, Grad Norm: 0.0063 +[2025-02-20 08:44:42] (step=0707400) Train Loss: 0.1500, Train Steps/Sec: 10.92, Grad Norm: 0.0062 +[2025-02-20 08:44:51] (step=0707500) Train Loss: 0.1488, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 08:44:59] (step=0707600) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 08:45:08] (step=0707700) Train Loss: 0.1489, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 08:45:17] (step=0707800) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 08:45:25] (step=0707900) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 08:45:34] (step=0708000) Train Loss: 0.1489, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 08:45:44] (step=0708100) Train Loss: 0.1492, Train Steps/Sec: 10.03, Grad Norm: 0.0061 +[2025-02-20 08:45:53] (step=0708200) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 08:46:01] (step=0708300) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 08:46:10] (step=0708400) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 08:46:19] (step=0708500) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 08:46:27] (step=0708600) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 08:46:36] (step=0708700) Train Loss: 0.1505, Train Steps/Sec: 11.30, Grad Norm: 0.0062 +[2025-02-20 08:46:45] (step=0708800) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 08:46:53] (step=0708900) Train Loss: 0.1509, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 08:47:02] (step=0709000) Train Loss: 0.1500, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 08:47:10] (step=0709100) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 08:47:19] (step=0709200) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 08:47:28] (step=0709300) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 08:47:38] (step=0709400) Train Loss: 0.1497, Train Steps/Sec: 9.53, Grad Norm: 0.0061 +[2025-02-20 08:47:47] (step=0709500) Train Loss: 0.1500, Train Steps/Sec: 10.89, Grad Norm: 0.0060 +[2025-02-20 08:47:56] (step=0709600) Train Loss: 0.1501, Train Steps/Sec: 10.93, Grad Norm: 0.0058 +[2025-02-20 08:48:05] (step=0709700) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 08:48:14] (step=0709800) Train Loss: 0.1492, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 08:48:22] (step=0709900) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 08:48:31] (step=0710000) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 08:48:39] (step=0710100) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 08:48:48] (step=0710200) Train Loss: 0.1501, Train Steps/Sec: 11.24, Grad Norm: 0.0058 +[2025-02-20 08:48:57] (step=0710300) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:49:06] (step=0710400) Train Loss: 0.1482, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 08:49:14] (step=0710500) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 08:49:24] (step=0710600) Train Loss: 0.1498, Train Steps/Sec: 9.99, Grad Norm: 0.0060 +[2025-02-20 08:49:33] (step=0710700) Train Loss: 0.1503, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 08:49:42] (step=0710800) Train Loss: 0.1497, Train Steps/Sec: 11.21, Grad Norm: 0.0060 +[2025-02-20 08:49:50] (step=0710900) Train Loss: 0.1490, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 08:49:59] (step=0711000) Train Loss: 0.1494, Train Steps/Sec: 11.58, Grad Norm: 0.0058 +[2025-02-20 08:50:08] (step=0711100) Train Loss: 0.1487, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 08:50:16] (step=0711200) Train Loss: 0.1504, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 08:50:25] (step=0711300) Train Loss: 0.1502, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 08:50:34] (step=0711400) Train Loss: 0.1514, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 08:50:42] (step=0711500) Train Loss: 0.1511, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 08:50:51] (step=0711600) Train Loss: 0.1491, Train Steps/Sec: 10.95, Grad Norm: 0.0064 +[2025-02-20 08:51:00] (step=0711700) Train Loss: 0.1506, Train Steps/Sec: 10.90, Grad Norm: 0.0060 +[2025-02-20 08:51:10] (step=0711800) Train Loss: 0.1505, Train Steps/Sec: 10.89, Grad Norm: 0.0059 +[2025-02-20 08:51:20] (step=0711900) Train Loss: 0.1516, Train Steps/Sec: 10.03, Grad Norm: 0.0061 +[2025-02-20 08:51:28] (step=0712000) Train Loss: 0.1503, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 08:51:37] (step=0712100) Train Loss: 0.1500, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 08:51:46] (step=0712200) Train Loss: 0.1502, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 08:51:54] (step=0712300) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 08:52:03] (step=0712400) Train Loss: 0.1486, Train Steps/Sec: 11.20, Grad Norm: 0.0059 +[2025-02-20 08:52:12] (step=0712500) Train Loss: 0.1499, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 08:52:20] (step=0712600) Train Loss: 0.1512, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 08:52:29] (step=0712700) Train Loss: 0.1510, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 08:52:38] (step=0712800) Train Loss: 0.1503, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 08:52:46] (step=0712900) Train Loss: 0.1494, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 08:52:55] (step=0713000) Train Loss: 0.1508, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-20 08:53:05] (step=0713100) Train Loss: 0.1509, Train Steps/Sec: 9.97, Grad Norm: 0.0062 +[2025-02-20 08:53:14] (step=0713200) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 08:53:22] (step=0713300) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 08:53:31] (step=0713400) Train Loss: 0.1480, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 08:53:40] (step=0713500) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 08:53:48] (step=0713600) Train Loss: 0.1488, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 08:53:57] (step=0713700) Train Loss: 0.1503, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-20 08:54:07] (step=0713800) Train Loss: 0.1516, Train Steps/Sec: 10.60, Grad Norm: 0.0060 +[2025-02-20 08:54:16] (step=0713900) Train Loss: 0.1501, Train Steps/Sec: 10.92, Grad Norm: 0.0060 +[2025-02-20 08:54:24] (step=0714000) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 08:54:33] (step=0714100) Train Loss: 0.1519, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 08:54:42] (step=0714200) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 08:54:50] (step=0714300) Train Loss: 0.1503, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 08:55:00] (step=0714400) Train Loss: 0.1486, Train Steps/Sec: 10.01, Grad Norm: 0.0061 +[2025-02-20 08:55:09] (step=0714500) Train Loss: 0.1495, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 08:55:18] (step=0714600) Train Loss: 0.1514, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 08:55:26] (step=0714700) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 08:55:35] (step=0714800) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 08:55:44] (step=0714900) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 08:55:52] (step=0715000) Train Loss: 0.1487, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 08:56:01] (step=0715100) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 08:56:10] (step=0715200) Train Loss: 0.1520, Train Steps/Sec: 11.27, Grad Norm: 0.0063 +[2025-02-20 08:56:18] (step=0715300) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:56:27] (step=0715400) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 08:56:36] (step=0715500) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 08:56:46] (step=0715600) Train Loss: 0.1492, Train Steps/Sec: 10.00, Grad Norm: 0.0060 +[2025-02-20 08:56:54] (step=0715700) Train Loss: 0.1494, Train Steps/Sec: 11.55, Grad Norm: 0.0060 +[2025-02-20 08:57:03] (step=0715800) Train Loss: 0.1495, Train Steps/Sec: 11.53, Grad Norm: 0.0063 +[2025-02-20 08:57:12] (step=0715900) Train Loss: 0.1501, Train Steps/Sec: 11.21, Grad Norm: 0.0061 +[2025-02-20 08:57:21] (step=0716000) Train Loss: 0.1506, Train Steps/Sec: 10.52, Grad Norm: 0.0059 +[2025-02-20 08:57:31] (step=0716100) Train Loss: 0.1494, Train Steps/Sec: 10.85, Grad Norm: 0.0060 +[2025-02-20 08:57:39] (step=0716200) Train Loss: 0.1488, Train Steps/Sec: 11.54, Grad Norm: 0.0062 +[2025-02-20 08:57:48] (step=0716300) Train Loss: 0.1507, Train Steps/Sec: 11.54, Grad Norm: 0.0061 +[2025-02-20 08:57:57] (step=0716400) Train Loss: 0.1494, Train Steps/Sec: 11.55, Grad Norm: 0.0059 +[2025-02-20 08:58:05] (step=0716500) Train Loss: 0.1500, Train Steps/Sec: 11.54, Grad Norm: 0.0061 +[2025-02-20 08:58:14] (step=0716600) Train Loss: 0.1510, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 08:58:23] (step=0716700) Train Loss: 0.1494, Train Steps/Sec: 11.18, Grad Norm: 0.0061 +[2025-02-20 08:58:31] (step=0716800) Train Loss: 0.1499, Train Steps/Sec: 11.51, Grad Norm: 0.0061 +[2025-02-20 08:58:41] (step=0716900) Train Loss: 0.1496, Train Steps/Sec: 10.06, Grad Norm: 0.0061 +[2025-02-20 08:58:50] (step=0717000) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 08:58:59] (step=0717100) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 08:59:07] (step=0717200) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 08:59:16] (step=0717300) Train Loss: 0.1489, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 08:59:25] (step=0717400) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:59:33] (step=0717500) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 08:59:42] (step=0717600) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 08:59:51] (step=0717700) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 08:59:59] (step=0717800) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 09:00:08] (step=0717900) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 09:00:16] (step=0718000) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 09:00:27] (step=0718100) Train Loss: 0.1492, Train Steps/Sec: 9.28, Grad Norm: 0.0062 +[2025-02-20 09:00:36] (step=0718200) Train Loss: 0.1490, Train Steps/Sec: 11.23, Grad Norm: 0.0063 +[2025-02-20 09:00:45] (step=0718300) Train Loss: 0.1505, Train Steps/Sec: 10.91, Grad Norm: 0.0062 +[2025-02-20 09:00:54] (step=0718400) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 09:01:02] (step=0718500) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 09:01:11] (step=0718600) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 09:01:20] (step=0718700) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 09:01:28] (step=0718800) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 09:01:37] (step=0718900) Train Loss: 0.1493, Train Steps/Sec: 11.22, Grad Norm: 0.0062 +[2025-02-20 09:01:46] (step=0719000) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 09:01:54] (step=0719100) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 09:02:03] (step=0719200) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 09:02:12] (step=0719300) Train Loss: 0.1496, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 09:02:22] (step=0719400) Train Loss: 0.1494, Train Steps/Sec: 9.97, Grad Norm: 0.0061 +[2025-02-20 09:02:31] (step=0719500) Train Loss: 0.1501, Train Steps/Sec: 11.28, Grad Norm: 0.0063 +[2025-02-20 09:02:39] (step=0719600) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 09:02:48] (step=0719700) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 09:02:56] (step=0719800) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 09:03:05] (step=0719900) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 09:03:14] (step=0720000) Train Loss: 0.1487, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 09:03:15] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0720000.pt +[2025-02-20 09:03:23] (step=0720100) Train Loss: 0.1503, Train Steps/Sec: 10.59, Grad Norm: 0.0060 +[2025-02-20 09:03:32] (step=0720200) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 09:03:41] (step=0720300) Train Loss: 0.1496, Train Steps/Sec: 10.29, Grad Norm: 0.0060 +[2025-02-20 09:03:50] (step=0720400) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 09:03:59] (step=0720500) Train Loss: 0.1491, Train Steps/Sec: 10.92, Grad Norm: 0.0063 +[2025-02-20 09:04:09] (step=0720600) Train Loss: 0.1501, Train Steps/Sec: 9.99, Grad Norm: 0.0059 +[2025-02-20 09:04:18] (step=0720700) Train Loss: 0.1507, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 09:04:27] (step=0720800) Train Loss: 0.1492, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 09:04:35] (step=0720900) Train Loss: 0.1497, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 09:04:44] (step=0721000) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 09:04:53] (step=0721100) Train Loss: 0.1509, Train Steps/Sec: 11.21, Grad Norm: 0.0061 +[2025-02-20 09:05:01] (step=0721200) Train Loss: 0.1495, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 09:05:10] (step=0721300) Train Loss: 0.1514, Train Steps/Sec: 11.57, Grad Norm: 0.0064 +[2025-02-20 09:05:19] (step=0721400) Train Loss: 0.1490, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 09:05:27] (step=0721500) Train Loss: 0.1503, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 09:05:36] (step=0721600) Train Loss: 0.1492, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 09:05:45] (step=0721700) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 09:05:53] (step=0721800) Train Loss: 0.1488, Train Steps/Sec: 11.55, Grad Norm: 0.0059 +[2025-02-20 09:06:03] (step=0721900) Train Loss: 0.1494, Train Steps/Sec: 10.10, Grad Norm: 0.0059 +[2025-02-20 09:06:12] (step=0722000) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 09:06:21] (step=0722100) Train Loss: 0.1488, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 09:06:29] (step=0722200) Train Loss: 0.1504, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 09:06:38] (step=0722300) Train Loss: 0.1506, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 09:06:47] (step=0722400) Train Loss: 0.1491, Train Steps/Sec: 11.25, Grad Norm: 0.0058 +[2025-02-20 09:06:56] (step=0722500) Train Loss: 0.1512, Train Steps/Sec: 10.55, Grad Norm: 0.0059 +[2025-02-20 09:07:05] (step=0722600) Train Loss: 0.1508, Train Steps/Sec: 10.89, Grad Norm: 0.0062 +[2025-02-20 09:07:14] (step=0722700) Train Loss: 0.1492, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 09:07:23] (step=0722800) Train Loss: 0.1496, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 09:07:31] (step=0722900) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 09:07:40] (step=0723000) Train Loss: 0.1498, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 09:07:50] (step=0723100) Train Loss: 0.1504, Train Steps/Sec: 9.95, Grad Norm: 0.0062 +[2025-02-20 09:07:59] (step=0723200) Train Loss: 0.1502, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 09:08:08] (step=0723300) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 09:08:16] (step=0723400) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 09:08:25] (step=0723500) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 09:08:33] (step=0723600) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 09:08:42] (step=0723700) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 09:08:51] (step=0723800) Train Loss: 0.1513, Train Steps/Sec: 11.27, Grad Norm: 0.0063 +[2025-02-20 09:08:59] (step=0723900) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 09:09:08] (step=0724000) Train Loss: 0.1489, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 09:09:17] (step=0724100) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 09:09:25] (step=0724200) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 09:09:34] (step=0724300) Train Loss: 0.1500, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 09:09:44] (step=0724400) Train Loss: 0.1497, Train Steps/Sec: 10.11, Grad Norm: 0.0060 +[2025-02-20 09:09:52] (step=0724500) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 09:10:02] (step=0724600) Train Loss: 0.1489, Train Steps/Sec: 10.96, Grad Norm: 0.0059 +[2025-02-20 09:10:11] (step=0724700) Train Loss: 0.1507, Train Steps/Sec: 10.89, Grad Norm: 0.0059 +[2025-02-20 09:10:20] (step=0724800) Train Loss: 0.1487, Train Steps/Sec: 10.94, Grad Norm: 0.0061 +[2025-02-20 09:10:29] (step=0724900) Train Loss: 0.1489, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 09:10:37] (step=0725000) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 09:10:46] (step=0725100) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 09:10:54] (step=0725200) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 09:11:03] (step=0725300) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 09:11:12] (step=0725400) Train Loss: 0.1482, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 09:11:20] (step=0725500) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 09:11:30] (step=0725600) Train Loss: 0.1501, Train Steps/Sec: 10.02, Grad Norm: 0.0062 +[2025-02-20 09:11:39] (step=0725700) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:11:48] (step=0725800) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:11:56] (step=0725900) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 09:12:05] (step=0726000) Train Loss: 0.1496, Train Steps/Sec: 11.29, Grad Norm: 0.0059 +[2025-02-20 09:12:14] (step=0726100) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 09:12:22] (step=0726200) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 09:12:31] (step=0726300) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 09:12:39] (step=0726400) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 09:12:48] (step=0726500) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 09:12:57] (step=0726600) Train Loss: 0.1491, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 09:13:05] (step=0726700) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 09:13:15] (step=0726800) Train Loss: 0.1501, Train Steps/Sec: 10.90, Grad Norm: 0.0059 +[2025-02-20 09:13:25] (step=0726900) Train Loss: 0.1516, Train Steps/Sec: 9.47, Grad Norm: 0.0061 +[2025-02-20 09:13:34] (step=0727000) Train Loss: 0.1496, Train Steps/Sec: 10.90, Grad Norm: 0.0060 +[2025-02-20 09:13:43] (step=0727100) Train Loss: 0.1499, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 09:13:51] (step=0727200) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 09:14:00] (step=0727300) Train Loss: 0.1488, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 09:14:09] (step=0727400) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 09:14:17] (step=0727500) Train Loss: 0.1493, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 09:14:26] (step=0727600) Train Loss: 0.1480, Train Steps/Sec: 11.26, Grad Norm: 0.0060 +[2025-02-20 09:14:35] (step=0727700) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 09:14:43] (step=0727800) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 09:14:52] (step=0727900) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 09:15:00] (step=0728000) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 09:15:11] (step=0728100) Train Loss: 0.1511, Train Steps/Sec: 9.75, Grad Norm: 0.0064 +[2025-02-20 09:15:19] (step=0728200) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 09:15:28] (step=0728300) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 09:15:37] (step=0728400) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 09:15:45] (step=0728500) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 09:15:54] (step=0728600) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 09:16:02] (step=0728700) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 09:16:11] (step=0728800) Train Loss: 0.1499, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 09:16:20] (step=0728900) Train Loss: 0.1496, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 09:16:29] (step=0729000) Train Loss: 0.1500, Train Steps/Sec: 10.33, Grad Norm: 0.0062 +[2025-02-20 09:16:38] (step=0729100) Train Loss: 0.1514, Train Steps/Sec: 11.24, Grad Norm: 0.0063 +[2025-02-20 09:16:47] (step=0729200) Train Loss: 0.1509, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 09:16:56] (step=0729300) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 09:17:06] (step=0729400) Train Loss: 0.1503, Train Steps/Sec: 9.94, Grad Norm: 0.0067 +[2025-02-20 09:17:14] (step=0729500) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 09:17:23] (step=0729600) Train Loss: 0.1484, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 09:17:31] (step=0729700) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 09:17:40] (step=0729800) Train Loss: 0.1497, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-20 09:17:49] (step=0729900) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 09:17:58] (step=0730000) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0057 +[2025-02-20 09:18:06] (step=0730100) Train Loss: 0.1483, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 09:18:15] (step=0730200) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 09:18:24] (step=0730300) Train Loss: 0.1503, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-20 09:18:32] (step=0730400) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 09:18:41] (step=0730500) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 09:18:51] (step=0730600) Train Loss: 0.1499, Train Steps/Sec: 10.00, Grad Norm: 0.0061 +[2025-02-20 09:18:59] (step=0730700) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 09:19:08] (step=0730800) Train Loss: 0.1485, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 09:19:17] (step=0730900) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 09:19:25] (step=0731000) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 09:19:34] (step=0731100) Train Loss: 0.1496, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 09:19:44] (step=0731200) Train Loss: 0.1511, Train Steps/Sec: 10.59, Grad Norm: 0.0060 +[2025-02-20 09:19:53] (step=0731300) Train Loss: 0.1511, Train Steps/Sec: 10.90, Grad Norm: 0.0064 +[2025-02-20 09:20:01] (step=0731400) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 09:20:10] (step=0731500) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 09:20:19] (step=0731600) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 09:20:27] (step=0731700) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 09:20:36] (step=0731800) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 09:20:46] (step=0731900) Train Loss: 0.1505, Train Steps/Sec: 9.70, Grad Norm: 0.0062 +[2025-02-20 09:20:55] (step=0732000) Train Loss: 0.1481, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:21:03] (step=0732100) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 09:21:12] (step=0732200) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 09:21:21] (step=0732300) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 09:21:29] (step=0732400) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 09:21:38] (step=0732500) Train Loss: 0.1496, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-20 09:21:47] (step=0732600) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 09:21:55] (step=0732700) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:22:04] (step=0732800) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 09:22:12] (step=0732900) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 09:22:21] (step=0733000) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 09:22:31] (step=0733100) Train Loss: 0.1499, Train Steps/Sec: 9.97, Grad Norm: 0.0061 +[2025-02-20 09:22:40] (step=0733200) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 09:22:49] (step=0733300) Train Loss: 0.1501, Train Steps/Sec: 10.94, Grad Norm: 0.0064 +[2025-02-20 09:22:58] (step=0733400) Train Loss: 0.1499, Train Steps/Sec: 10.88, Grad Norm: 0.0062 +[2025-02-20 09:23:07] (step=0733500) Train Loss: 0.1496, Train Steps/Sec: 10.91, Grad Norm: 0.0061 +[2025-02-20 09:23:16] (step=0733600) Train Loss: 0.1502, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 09:23:24] (step=0733700) Train Loss: 0.1498, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 09:23:33] (step=0733800) Train Loss: 0.1486, Train Steps/Sec: 11.66, Grad Norm: 0.0063 +[2025-02-20 09:23:41] (step=0733900) Train Loss: 0.1491, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 09:23:50] (step=0734000) Train Loss: 0.1491, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 09:23:59] (step=0734100) Train Loss: 0.1505, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-20 09:24:08] (step=0734200) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:24:16] (step=0734300) Train Loss: 0.1486, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:24:26] (step=0734400) Train Loss: 0.1514, Train Steps/Sec: 10.01, Grad Norm: 0.0060 +[2025-02-20 09:24:35] (step=0734500) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 09:24:44] (step=0734600) Train Loss: 0.1492, Train Steps/Sec: 11.29, Grad Norm: 0.0063 +[2025-02-20 09:24:52] (step=0734700) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 09:25:01] (step=0734800) Train Loss: 0.1483, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 09:25:09] (step=0734900) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 09:25:18] (step=0735000) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 09:25:27] (step=0735100) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 09:25:35] (step=0735200) Train Loss: 0.1490, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 09:25:44] (step=0735300) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:25:52] (step=0735400) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 09:26:02] (step=0735500) Train Loss: 0.1515, Train Steps/Sec: 10.63, Grad Norm: 0.0061 +[2025-02-20 09:26:12] (step=0735600) Train Loss: 0.1502, Train Steps/Sec: 9.51, Grad Norm: 0.0063 +[2025-02-20 09:26:21] (step=0735700) Train Loss: 0.1511, Train Steps/Sec: 11.29, Grad Norm: 0.0060 +[2025-02-20 09:26:30] (step=0735800) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:26:38] (step=0735900) Train Loss: 0.1503, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 09:26:47] (step=0736000) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 09:26:56] (step=0736100) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 09:27:04] (step=0736200) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 09:27:13] (step=0736300) Train Loss: 0.1493, Train Steps/Sec: 11.24, Grad Norm: 0.0057 +[2025-02-20 09:27:22] (step=0736400) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 09:27:30] (step=0736500) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 09:27:39] (step=0736600) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 09:27:48] (step=0736700) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 09:27:56] (step=0736800) Train Loss: 0.1507, Train Steps/Sec: 11.27, Grad Norm: 0.0063 +[2025-02-20 09:28:06] (step=0736900) Train Loss: 0.1491, Train Steps/Sec: 9.92, Grad Norm: 0.0059 +[2025-02-20 09:28:15] (step=0737000) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 09:28:24] (step=0737100) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 09:28:32] (step=0737200) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 09:28:41] (step=0737300) Train Loss: 0.1487, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:28:49] (step=0737400) Train Loss: 0.1488, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 09:28:58] (step=0737500) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 09:29:07] (step=0737600) Train Loss: 0.1519, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 09:29:16] (step=0737700) Train Loss: 0.1500, Train Steps/Sec: 10.30, Grad Norm: 0.0062 +[2025-02-20 09:29:25] (step=0737800) Train Loss: 0.1490, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-20 09:29:34] (step=0737900) Train Loss: 0.1507, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:29:43] (step=0738000) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 09:29:53] (step=0738100) Train Loss: 0.1513, Train Steps/Sec: 10.01, Grad Norm: 0.0065 +[2025-02-20 09:30:01] (step=0738200) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 09:30:10] (step=0738300) Train Loss: 0.1488, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:30:18] (step=0738400) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 09:30:27] (step=0738500) Train Loss: 0.1500, Train Steps/Sec: 11.25, Grad Norm: 0.0062 +[2025-02-20 09:30:36] (step=0738600) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 09:30:45] (step=0738700) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 09:30:53] (step=0738800) Train Loss: 0.1491, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 09:31:02] (step=0738900) Train Loss: 0.1500, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 09:31:11] (step=0739000) Train Loss: 0.1491, Train Steps/Sec: 11.30, Grad Norm: 0.0061 +[2025-02-20 09:31:19] (step=0739100) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:31:28] (step=0739200) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:31:36] (step=0739300) Train Loss: 0.1491, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 09:31:46] (step=0739400) Train Loss: 0.1508, Train Steps/Sec: 9.99, Grad Norm: 0.0062 +[2025-02-20 09:31:55] (step=0739500) Train Loss: 0.1487, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:32:03] (step=0739600) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 09:32:12] (step=0739700) Train Loss: 0.1497, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 09:32:21] (step=0739800) Train Loss: 0.1503, Train Steps/Sec: 10.99, Grad Norm: 0.0058 +[2025-02-20 09:32:30] (step=0739900) Train Loss: 0.1499, Train Steps/Sec: 10.92, Grad Norm: 0.0062 +[2025-02-20 09:32:39] (step=0740000) Train Loss: 0.1513, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-20 09:32:41] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0740000.pt +[2025-02-20 09:32:50] (step=0740100) Train Loss: 0.1506, Train Steps/Sec: 9.93, Grad Norm: 0.0061 +[2025-02-20 09:32:58] (step=0740200) Train Loss: 0.1486, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 09:33:07] (step=0740300) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 09:33:15] (step=0740400) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 09:33:24] (step=0740500) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 09:33:34] (step=0740600) Train Loss: 0.1497, Train Steps/Sec: 9.59, Grad Norm: 0.0061 +[2025-02-20 09:33:43] (step=0740700) Train Loss: 0.1495, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:33:52] (step=0740800) Train Loss: 0.1506, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:34:00] (step=0740900) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 09:34:09] (step=0741000) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 09:34:18] (step=0741100) Train Loss: 0.1499, Train Steps/Sec: 11.29, Grad Norm: 0.0059 +[2025-02-20 09:34:26] (step=0741200) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:34:35] (step=0741300) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 09:34:43] (step=0741400) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 09:34:52] (step=0741500) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 09:35:01] (step=0741600) Train Loss: 0.1495, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:35:09] (step=0741700) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:35:18] (step=0741800) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:35:28] (step=0741900) Train Loss: 0.1505, Train Steps/Sec: 9.97, Grad Norm: 0.0063 +[2025-02-20 09:35:37] (step=0742000) Train Loss: 0.1512, Train Steps/Sec: 11.00, Grad Norm: 0.0063 +[2025-02-20 09:35:46] (step=0742100) Train Loss: 0.1498, Train Steps/Sec: 10.90, Grad Norm: 0.0062 +[2025-02-20 09:35:55] (step=0742200) Train Loss: 0.1506, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-20 09:36:04] (step=0742300) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:36:12] (step=0742400) Train Loss: 0.1502, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 09:36:21] (step=0742500) Train Loss: 0.1504, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 09:36:30] (step=0742600) Train Loss: 0.1489, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 09:36:38] (step=0742700) Train Loss: 0.1502, Train Steps/Sec: 11.67, Grad Norm: 0.0062 +[2025-02-20 09:36:47] (step=0742800) Train Loss: 0.1492, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 09:36:56] (step=0742900) Train Loss: 0.1504, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 09:37:04] (step=0743000) Train Loss: 0.1496, Train Steps/Sec: 11.67, Grad Norm: 0.0062 +[2025-02-20 09:37:14] (step=0743100) Train Loss: 0.1503, Train Steps/Sec: 10.01, Grad Norm: 0.0061 +[2025-02-20 09:37:23] (step=0743200) Train Loss: 0.1501, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:37:32] (step=0743300) Train Loss: 0.1498, Train Steps/Sec: 11.30, Grad Norm: 0.0060 +[2025-02-20 09:37:40] (step=0743400) Train Loss: 0.1496, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 09:37:49] (step=0743500) Train Loss: 0.1493, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 09:37:57] (step=0743600) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 09:38:06] (step=0743700) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 09:38:14] (step=0743800) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 09:38:23] (step=0743900) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 09:38:32] (step=0744000) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 09:38:40] (step=0744100) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 09:38:50] (step=0744200) Train Loss: 0.1502, Train Steps/Sec: 10.63, Grad Norm: 0.0062 +[2025-02-20 09:38:59] (step=0744300) Train Loss: 0.1495, Train Steps/Sec: 10.91, Grad Norm: 0.0059 +[2025-02-20 09:39:09] (step=0744400) Train Loss: 0.1489, Train Steps/Sec: 9.77, Grad Norm: 0.0059 +[2025-02-20 09:39:18] (step=0744500) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:39:26] (step=0744600) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 09:39:35] (step=0744700) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:39:43] (step=0744800) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:39:52] (step=0744900) Train Loss: 0.1487, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 09:40:01] (step=0745000) Train Loss: 0.1502, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-20 09:40:09] (step=0745100) Train Loss: 0.1511, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 09:40:18] (step=0745200) Train Loss: 0.1499, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 09:40:27] (step=0745300) Train Loss: 0.1508, Train Steps/Sec: 11.67, Grad Norm: 0.0062 +[2025-02-20 09:40:35] (step=0745400) Train Loss: 0.1484, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 09:40:44] (step=0745500) Train Loss: 0.1504, Train Steps/Sec: 11.30, Grad Norm: 0.0060 +[2025-02-20 09:40:54] (step=0745600) Train Loss: 0.1492, Train Steps/Sec: 10.01, Grad Norm: 0.0058 +[2025-02-20 09:41:03] (step=0745700) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 09:41:11] (step=0745800) Train Loss: 0.1501, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 09:41:20] (step=0745900) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 09:41:28] (step=0746000) Train Loss: 0.1497, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 09:41:37] (step=0746100) Train Loss: 0.1511, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 09:41:46] (step=0746200) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 09:41:54] (step=0746300) Train Loss: 0.1488, Train Steps/Sec: 11.31, Grad Norm: 0.0061 +[2025-02-20 09:42:04] (step=0746400) Train Loss: 0.1503, Train Steps/Sec: 10.59, Grad Norm: 0.0063 +[2025-02-20 09:42:13] (step=0746500) Train Loss: 0.1495, Train Steps/Sec: 10.96, Grad Norm: 0.0061 +[2025-02-20 09:42:22] (step=0746600) Train Loss: 0.1499, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 09:42:30] (step=0746700) Train Loss: 0.1495, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 09:42:39] (step=0746800) Train Loss: 0.1501, Train Steps/Sec: 11.67, Grad Norm: 0.0062 +[2025-02-20 09:42:49] (step=0746900) Train Loss: 0.1503, Train Steps/Sec: 9.99, Grad Norm: 0.0060 +[2025-02-20 09:42:57] (step=0747000) Train Loss: 0.1495, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:43:06] (step=0747100) Train Loss: 0.1506, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 09:43:15] (step=0747200) Train Loss: 0.1505, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-20 09:43:23] (step=0747300) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 09:43:32] (step=0747400) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 09:43:41] (step=0747500) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 09:43:49] (step=0747600) Train Loss: 0.1501, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:43:58] (step=0747700) Train Loss: 0.1487, Train Steps/Sec: 11.31, Grad Norm: 0.0062 +[2025-02-20 09:44:07] (step=0747800) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 09:44:15] (step=0747900) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 09:44:24] (step=0748000) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 09:44:34] (step=0748100) Train Loss: 0.1499, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-20 09:44:42] (step=0748200) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 09:44:51] (step=0748300) Train Loss: 0.1491, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 09:45:00] (step=0748400) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 09:45:09] (step=0748500) Train Loss: 0.1500, Train Steps/Sec: 10.98, Grad Norm: 0.0059 +[2025-02-20 09:45:18] (step=0748600) Train Loss: 0.1501, Train Steps/Sec: 10.91, Grad Norm: 0.0058 +[2025-02-20 09:45:27] (step=0748700) Train Loss: 0.1496, Train Steps/Sec: 10.95, Grad Norm: 0.0059 +[2025-02-20 09:45:36] (step=0748800) Train Loss: 0.1504, Train Steps/Sec: 11.67, Grad Norm: 0.0059 +[2025-02-20 09:45:44] (step=0748900) Train Loss: 0.1512, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 09:45:53] (step=0749000) Train Loss: 0.1500, Train Steps/Sec: 11.67, Grad Norm: 0.0059 +[2025-02-20 09:46:01] (step=0749100) Train Loss: 0.1511, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 09:46:10] (step=0749200) Train Loss: 0.1487, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 09:46:19] (step=0749300) Train Loss: 0.1499, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 09:46:29] (step=0749400) Train Loss: 0.1493, Train Steps/Sec: 10.08, Grad Norm: 0.0058 +[2025-02-20 09:46:37] (step=0749500) Train Loss: 0.1497, Train Steps/Sec: 11.67, Grad Norm: 0.0062 +[2025-02-20 09:46:46] (step=0749600) Train Loss: 0.1498, Train Steps/Sec: 11.68, Grad Norm: 0.0061 +[2025-02-20 09:46:54] (step=0749700) Train Loss: 0.1498, Train Steps/Sec: 11.68, Grad Norm: 0.0061 +[2025-02-20 09:47:03] (step=0749800) Train Loss: 0.1505, Train Steps/Sec: 11.30, Grad Norm: 0.0062 +[2025-02-20 09:47:12] (step=0749900) Train Loss: 0.1503, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 09:47:20] (step=0750000) Train Loss: 0.1517, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:47:29] (step=0750100) Train Loss: 0.1496, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-20 09:47:38] (step=0750200) Train Loss: 0.1504, Train Steps/Sec: 11.67, Grad Norm: 0.0063 +[2025-02-20 09:47:46] (step=0750300) Train Loss: 0.1500, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 09:47:55] (step=0750400) Train Loss: 0.1496, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 09:48:03] (step=0750500) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 09:48:12] (step=0750600) Train Loss: 0.1503, Train Steps/Sec: 10.84, Grad Norm: 0.0063 +[2025-02-20 09:48:22] (step=0750700) Train Loss: 0.1498, Train Steps/Sec: 10.14, Grad Norm: 0.0062 +[2025-02-20 09:48:32] (step=0750800) Train Loss: 0.1507, Train Steps/Sec: 10.88, Grad Norm: 0.0062 +[2025-02-20 09:48:41] (step=0750900) Train Loss: 0.1508, Train Steps/Sec: 10.90, Grad Norm: 0.0061 +[2025-02-20 09:48:49] (step=0751000) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 09:48:58] (step=0751100) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 09:49:07] (step=0751200) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 09:49:15] (step=0751300) Train Loss: 0.1493, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 09:49:24] (step=0751400) Train Loss: 0.1493, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 09:49:33] (step=0751500) Train Loss: 0.1484, Train Steps/Sec: 11.29, Grad Norm: 0.0063 +[2025-02-20 09:49:41] (step=0751600) Train Loss: 0.1501, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-20 09:49:50] (step=0751700) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 09:49:58] (step=0751800) Train Loss: 0.1497, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 09:50:08] (step=0751900) Train Loss: 0.1501, Train Steps/Sec: 10.02, Grad Norm: 0.0060 +[2025-02-20 09:50:17] (step=0752000) Train Loss: 0.1484, Train Steps/Sec: 11.32, Grad Norm: 0.0061 +[2025-02-20 09:50:26] (step=0752100) Train Loss: 0.1513, Train Steps/Sec: 11.67, Grad Norm: 0.0065 +[2025-02-20 09:50:34] (step=0752200) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 09:50:43] (step=0752300) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 09:50:51] (step=0752400) Train Loss: 0.1506, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 09:51:00] (step=0752500) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 09:51:09] (step=0752600) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 09:51:17] (step=0752700) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 09:51:26] (step=0752800) Train Loss: 0.1496, Train Steps/Sec: 11.31, Grad Norm: 0.0066 +[2025-02-20 09:51:35] (step=0752900) Train Loss: 0.1504, Train Steps/Sec: 10.62, Grad Norm: 0.0062 +[2025-02-20 09:51:44] (step=0753000) Train Loss: 0.1502, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 09:51:53] (step=0753100) Train Loss: 0.1501, Train Steps/Sec: 11.26, Grad Norm: 0.0060 +[2025-02-20 09:52:03] (step=0753200) Train Loss: 0.1503, Train Steps/Sec: 10.08, Grad Norm: 0.0059 +[2025-02-20 09:52:12] (step=0753300) Train Loss: 0.1510, Train Steps/Sec: 11.67, Grad Norm: 0.0062 +[2025-02-20 09:52:20] (step=0753400) Train Loss: 0.1504, Train Steps/Sec: 11.67, Grad Norm: 0.0062 +[2025-02-20 09:52:29] (step=0753500) Train Loss: 0.1484, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 09:52:37] (step=0753600) Train Loss: 0.1494, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 09:52:46] (step=0753700) Train Loss: 0.1490, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-20 09:52:55] (step=0753800) Train Loss: 0.1497, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 09:53:03] (step=0753900) Train Loss: 0.1498, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 09:53:12] (step=0754000) Train Loss: 0.1501, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 09:53:21] (step=0754100) Train Loss: 0.1493, Train Steps/Sec: 11.31, Grad Norm: 0.0060 +[2025-02-20 09:53:29] (step=0754200) Train Loss: 0.1491, Train Steps/Sec: 11.65, Grad Norm: 0.0064 +[2025-02-20 09:53:38] (step=0754300) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:53:48] (step=0754400) Train Loss: 0.1503, Train Steps/Sec: 10.05, Grad Norm: 0.0061 +[2025-02-20 09:53:57] (step=0754500) Train Loss: 0.1499, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 09:54:05] (step=0754600) Train Loss: 0.1504, Train Steps/Sec: 11.67, Grad Norm: 0.0063 +[2025-02-20 09:54:14] (step=0754700) Train Loss: 0.1495, Train Steps/Sec: 11.68, Grad Norm: 0.0061 +[2025-02-20 09:54:22] (step=0754800) Train Loss: 0.1479, Train Steps/Sec: 11.67, Grad Norm: 0.0062 +[2025-02-20 09:54:31] (step=0754900) Train Loss: 0.1510, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-20 09:54:40] (step=0755000) Train Loss: 0.1507, Train Steps/Sec: 11.32, Grad Norm: 0.0062 +[2025-02-20 09:54:49] (step=0755100) Train Loss: 0.1493, Train Steps/Sec: 10.63, Grad Norm: 0.0059 +[2025-02-20 09:54:58] (step=0755200) Train Loss: 0.1498, Train Steps/Sec: 10.95, Grad Norm: 0.0061 +[2025-02-20 09:55:07] (step=0755300) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 09:55:15] (step=0755400) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 09:55:24] (step=0755500) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 09:55:33] (step=0755600) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 09:55:43] (step=0755700) Train Loss: 0.1492, Train Steps/Sec: 10.04, Grad Norm: 0.0061 +[2025-02-20 09:55:51] (step=0755800) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:56:00] (step=0755900) Train Loss: 0.1495, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-20 09:56:09] (step=0756000) Train Loss: 0.1495, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-20 09:56:17] (step=0756100) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 09:56:26] (step=0756200) Train Loss: 0.1487, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:56:35] (step=0756300) Train Loss: 0.1506, Train Steps/Sec: 11.29, Grad Norm: 0.0060 +[2025-02-20 09:56:43] (step=0756400) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 09:56:52] (step=0756500) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 09:57:00] (step=0756600) Train Loss: 0.1492, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 09:57:09] (step=0756700) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:57:18] (step=0756800) Train Loss: 0.1514, Train Steps/Sec: 11.66, Grad Norm: 0.0058 +[2025-02-20 09:57:28] (step=0756900) Train Loss: 0.1501, Train Steps/Sec: 10.03, Grad Norm: 0.0060 +[2025-02-20 09:57:36] (step=0757000) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 09:57:45] (step=0757100) Train Loss: 0.1508, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:57:54] (step=0757200) Train Loss: 0.1493, Train Steps/Sec: 10.97, Grad Norm: 0.0061 +[2025-02-20 09:58:03] (step=0757300) Train Loss: 0.1502, Train Steps/Sec: 10.89, Grad Norm: 0.0060 +[2025-02-20 09:58:12] (step=0757400) Train Loss: 0.1506, Train Steps/Sec: 10.91, Grad Norm: 0.0061 +[2025-02-20 09:58:21] (step=0757500) Train Loss: 0.1483, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 09:58:29] (step=0757600) Train Loss: 0.1487, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 09:58:38] (step=0757700) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 09:58:47] (step=0757800) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 09:58:55] (step=0757900) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 09:59:04] (step=0758000) Train Loss: 0.1488, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 09:59:13] (step=0758100) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 09:59:23] (step=0758200) Train Loss: 0.1496, Train Steps/Sec: 10.08, Grad Norm: 0.0061 +[2025-02-20 09:59:31] (step=0758300) Train Loss: 0.1492, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 09:59:40] (step=0758400) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 09:59:49] (step=0758500) Train Loss: 0.1502, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 09:59:57] (step=0758600) Train Loss: 0.1498, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-20 10:00:06] (step=0758700) Train Loss: 0.1496, Train Steps/Sec: 11.67, Grad Norm: 0.0059 +[2025-02-20 10:00:15] (step=0758800) Train Loss: 0.1492, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 10:00:23] (step=0758900) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 10:00:32] (step=0759000) Train Loss: 0.1500, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 10:00:40] (step=0759100) Train Loss: 0.1508, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-20 10:00:49] (step=0759200) Train Loss: 0.1495, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 10:00:57] (step=0759300) Train Loss: 0.1505, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 10:01:08] (step=0759400) Train Loss: 0.1507, Train Steps/Sec: 9.27, Grad Norm: 0.0062 +[2025-02-20 10:01:17] (step=0759500) Train Loss: 0.1502, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 10:01:26] (step=0759600) Train Loss: 0.1497, Train Steps/Sec: 10.90, Grad Norm: 0.0061 +[2025-02-20 10:01:35] (step=0759700) Train Loss: 0.1488, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 10:01:44] (step=0759800) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 10:01:52] (step=0759900) Train Loss: 0.1489, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 10:02:01] (step=0760000) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 10:02:02] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0760000.pt +[2025-02-20 10:02:10] (step=0760100) Train Loss: 0.1483, Train Steps/Sec: 10.38, Grad Norm: 0.0059 +[2025-02-20 10:02:19] (step=0760200) Train Loss: 0.1491, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 10:02:28] (step=0760300) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 10:02:37] (step=0760400) Train Loss: 0.1480, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 10:02:45] (step=0760500) Train Loss: 0.1488, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 10:02:54] (step=0760600) Train Loss: 0.1499, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 10:03:04] (step=0760700) Train Loss: 0.1502, Train Steps/Sec: 9.96, Grad Norm: 0.0060 +[2025-02-20 10:03:13] (step=0760800) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 10:03:21] (step=0760900) Train Loss: 0.1485, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 10:03:30] (step=0761000) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 10:03:38] (step=0761100) Train Loss: 0.1482, Train Steps/Sec: 11.65, Grad Norm: 0.0058 +[2025-02-20 10:03:47] (step=0761200) Train Loss: 0.1487, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 10:03:56] (step=0761300) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 10:04:04] (step=0761400) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 10:04:13] (step=0761500) Train Loss: 0.1505, Train Steps/Sec: 11.31, Grad Norm: 0.0061 +[2025-02-20 10:04:22] (step=0761600) Train Loss: 0.1494, Train Steps/Sec: 10.60, Grad Norm: 0.0060 +[2025-02-20 10:04:32] (step=0761700) Train Loss: 0.1514, Train Steps/Sec: 10.94, Grad Norm: 0.0061 +[2025-02-20 10:04:40] (step=0761800) Train Loss: 0.1491, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 10:04:50] (step=0761900) Train Loss: 0.1496, Train Steps/Sec: 9.98, Grad Norm: 0.0065 +[2025-02-20 10:04:59] (step=0762000) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 10:05:07] (step=0762100) Train Loss: 0.1488, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 10:05:16] (step=0762200) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 10:05:25] (step=0762300) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 10:05:34] (step=0762400) Train Loss: 0.1490, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-20 10:05:42] (step=0762500) Train Loss: 0.1520, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 10:05:51] (step=0762600) Train Loss: 0.1485, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 10:05:59] (step=0762700) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 10:06:08] (step=0762800) Train Loss: 0.1484, Train Steps/Sec: 11.31, Grad Norm: 0.0059 +[2025-02-20 10:06:17] (step=0762900) Train Loss: 0.1487, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 10:06:25] (step=0763000) Train Loss: 0.1523, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 10:06:34] (step=0763100) Train Loss: 0.1488, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 10:06:44] (step=0763200) Train Loss: 0.1505, Train Steps/Sec: 10.12, Grad Norm: 0.0061 +[2025-02-20 10:06:53] (step=0763300) Train Loss: 0.1489, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 10:07:01] (step=0763400) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 10:07:10] (step=0763500) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 10:07:18] (step=0763600) Train Loss: 0.1489, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 10:07:27] (step=0763700) Train Loss: 0.1503, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-20 10:07:37] (step=0763800) Train Loss: 0.1501, Train Steps/Sec: 10.63, Grad Norm: 0.0061 +[2025-02-20 10:07:46] (step=0763900) Train Loss: 0.1502, Train Steps/Sec: 10.95, Grad Norm: 0.0059 +[2025-02-20 10:07:54] (step=0764000) Train Loss: 0.1488, Train Steps/Sec: 11.65, Grad Norm: 0.0065 +[2025-02-20 10:08:03] (step=0764100) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 10:08:11] (step=0764200) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 10:08:20] (step=0764300) Train Loss: 0.1499, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 10:08:30] (step=0764400) Train Loss: 0.1499, Train Steps/Sec: 10.03, Grad Norm: 0.0062 +[2025-02-20 10:08:39] (step=0764500) Train Loss: 0.1499, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-20 10:08:48] (step=0764600) Train Loss: 0.1506, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 10:08:56] (step=0764700) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0066 +[2025-02-20 10:09:05] (step=0764800) Train Loss: 0.1491, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 10:09:14] (step=0764900) Train Loss: 0.1502, Train Steps/Sec: 11.31, Grad Norm: 0.0060 +[2025-02-20 10:09:22] (step=0765000) Train Loss: 0.1508, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 10:09:31] (step=0765100) Train Loss: 0.1488, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 10:09:39] (step=0765200) Train Loss: 0.1504, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 10:09:48] (step=0765300) Train Loss: 0.1506, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 10:09:56] (step=0765400) Train Loss: 0.1498, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 10:10:05] (step=0765500) Train Loss: 0.1500, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 10:10:14] (step=0765600) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 10:10:24] (step=0765700) Train Loss: 0.1502, Train Steps/Sec: 10.06, Grad Norm: 0.0059 +[2025-02-20 10:10:32] (step=0765800) Train Loss: 0.1500, Train Steps/Sec: 11.68, Grad Norm: 0.0060 +[2025-02-20 10:10:41] (step=0765900) Train Loss: 0.1498, Train Steps/Sec: 10.98, Grad Norm: 0.0061 +[2025-02-20 10:10:50] (step=0766000) Train Loss: 0.1503, Train Steps/Sec: 10.94, Grad Norm: 0.0061 +[2025-02-20 10:11:00] (step=0766100) Train Loss: 0.1500, Train Steps/Sec: 10.96, Grad Norm: 0.0064 +[2025-02-20 10:11:08] (step=0766200) Train Loss: 0.1516, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 10:11:17] (step=0766300) Train Loss: 0.1499, Train Steps/Sec: 11.68, Grad Norm: 0.0062 +[2025-02-20 10:11:25] (step=0766400) Train Loss: 0.1500, Train Steps/Sec: 11.68, Grad Norm: 0.0060 +[2025-02-20 10:11:34] (step=0766500) Train Loss: 0.1504, Train Steps/Sec: 11.68, Grad Norm: 0.0059 +[2025-02-20 10:11:42] (step=0766600) Train Loss: 0.1501, Train Steps/Sec: 11.68, Grad Norm: 0.0060 +[2025-02-20 10:11:51] (step=0766700) Train Loss: 0.1507, Train Steps/Sec: 11.30, Grad Norm: 0.0061 +[2025-02-20 10:12:00] (step=0766800) Train Loss: 0.1488, Train Steps/Sec: 11.68, Grad Norm: 0.0062 +[2025-02-20 10:12:10] (step=0766900) Train Loss: 0.1508, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-20 10:12:18] (step=0767000) Train Loss: 0.1500, Train Steps/Sec: 11.66, Grad Norm: 0.0063 +[2025-02-20 10:12:27] (step=0767100) Train Loss: 0.1498, Train Steps/Sec: 11.31, Grad Norm: 0.0059 +[2025-02-20 10:12:36] (step=0767200) Train Loss: 0.1512, Train Steps/Sec: 11.68, Grad Norm: 0.0061 +[2025-02-20 10:12:44] (step=0767300) Train Loss: 0.1494, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 10:12:53] (step=0767400) Train Loss: 0.1492, Train Steps/Sec: 11.66, Grad Norm: 0.0058 +[2025-02-20 10:13:01] (step=0767500) Train Loss: 0.1496, Train Steps/Sec: 11.67, Grad Norm: 0.0059 +[2025-02-20 10:13:10] (step=0767600) Train Loss: 0.1499, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 10:13:19] (step=0767700) Train Loss: 0.1491, Train Steps/Sec: 11.68, Grad Norm: 0.0061 +[2025-02-20 10:13:27] (step=0767800) Train Loss: 0.1491, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-20 10:13:36] (step=0767900) Train Loss: 0.1510, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-20 10:13:44] (step=0768000) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 10:13:53] (step=0768100) Train Loss: 0.1499, Train Steps/Sec: 10.93, Grad Norm: 0.0062 +[2025-02-20 10:14:04] (step=0768200) Train Loss: 0.1504, Train Steps/Sec: 9.18, Grad Norm: 0.0059 +[2025-02-20 10:14:13] (step=0768300) Train Loss: 0.1513, Train Steps/Sec: 11.29, Grad Norm: 0.0058 +[2025-02-20 10:14:22] (step=0768400) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 10:14:30] (step=0768500) Train Loss: 0.1490, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 10:14:39] (step=0768600) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 10:14:48] (step=0768700) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 10:14:56] (step=0768800) Train Loss: 0.1500, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 10:15:05] (step=0768900) Train Loss: 0.1476, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 10:15:14] (step=0769000) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 10:15:22] (step=0769100) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 10:15:31] (step=0769200) Train Loss: 0.1484, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 10:15:40] (step=0769300) Train Loss: 0.1491, Train Steps/Sec: 11.28, Grad Norm: 0.0059 +[2025-02-20 10:15:50] (step=0769400) Train Loss: 0.1498, Train Steps/Sec: 9.94, Grad Norm: 0.0059 +[2025-02-20 10:15:58] (step=0769500) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 10:16:07] (step=0769600) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 10:16:16] (step=0769700) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 10:16:24] (step=0769800) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 10:16:33] (step=0769900) Train Loss: 0.1513, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 10:16:41] (step=0770000) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 10:16:50] (step=0770100) Train Loss: 0.1479, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 10:16:59] (step=0770200) Train Loss: 0.1490, Train Steps/Sec: 11.30, Grad Norm: 0.0059 +[2025-02-20 10:17:08] (step=0770300) Train Loss: 0.1489, Train Steps/Sec: 10.60, Grad Norm: 0.0061 +[2025-02-20 10:17:17] (step=0770400) Train Loss: 0.1501, Train Steps/Sec: 10.93, Grad Norm: 0.0061 +[2025-02-20 10:17:26] (step=0770500) Train Loss: 0.1513, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 10:17:35] (step=0770600) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 10:17:45] (step=0770700) Train Loss: 0.1494, Train Steps/Sec: 10.02, Grad Norm: 0.0058 +[2025-02-20 10:17:53] (step=0770800) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 10:18:02] (step=0770900) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 10:18:10] (step=0771000) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 10:18:19] (step=0771100) Train Loss: 0.1494, Train Steps/Sec: 11.26, Grad Norm: 0.0059 +[2025-02-20 10:18:28] (step=0771200) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 10:18:36] (step=0771300) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 10:18:45] (step=0771400) Train Loss: 0.1491, Train Steps/Sec: 11.26, Grad Norm: 0.0064 +[2025-02-20 10:18:54] (step=0771500) Train Loss: 0.1494, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 10:19:03] (step=0771600) Train Loss: 0.1499, Train Steps/Sec: 11.52, Grad Norm: 0.0061 +[2025-02-20 10:19:11] (step=0771700) Train Loss: 0.1501, Train Steps/Sec: 11.52, Grad Norm: 0.0060 +[2025-02-20 10:19:20] (step=0771800) Train Loss: 0.1496, Train Steps/Sec: 11.53, Grad Norm: 0.0059 +[2025-02-20 10:19:30] (step=0771900) Train Loss: 0.1498, Train Steps/Sec: 9.96, Grad Norm: 0.0059 +[2025-02-20 10:19:39] (step=0772000) Train Loss: 0.1493, Train Steps/Sec: 11.59, Grad Norm: 0.0058 +[2025-02-20 10:19:47] (step=0772100) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 10:19:56] (step=0772200) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 10:20:05] (step=0772300) Train Loss: 0.1485, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 10:20:13] (step=0772400) Train Loss: 0.1487, Train Steps/Sec: 11.19, Grad Norm: 0.0061 +[2025-02-20 10:20:23] (step=0772500) Train Loss: 0.1484, Train Steps/Sec: 10.55, Grad Norm: 0.0059 +[2025-02-20 10:20:32] (step=0772600) Train Loss: 0.1502, Train Steps/Sec: 10.88, Grad Norm: 0.0061 +[2025-02-20 10:20:41] (step=0772700) Train Loss: 0.1505, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 10:20:49] (step=0772800) Train Loss: 0.1514, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 10:20:58] (step=0772900) Train Loss: 0.1497, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 10:21:07] (step=0773000) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 10:21:15] (step=0773100) Train Loss: 0.1507, Train Steps/Sec: 11.55, Grad Norm: 0.0063 +[2025-02-20 10:21:26] (step=0773200) Train Loss: 0.1489, Train Steps/Sec: 9.84, Grad Norm: 0.0061 +[2025-02-20 10:21:34] (step=0773300) Train Loss: 0.1497, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-20 10:21:43] (step=0773400) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 10:21:52] (step=0773500) Train Loss: 0.1507, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 10:22:00] (step=0773600) Train Loss: 0.1490, Train Steps/Sec: 11.31, Grad Norm: 0.0059 +[2025-02-20 10:22:09] (step=0773700) Train Loss: 0.1491, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-20 10:22:18] (step=0773800) Train Loss: 0.1490, Train Steps/Sec: 11.69, Grad Norm: 0.0062 +[2025-02-20 10:22:26] (step=0773900) Train Loss: 0.1507, Train Steps/Sec: 11.69, Grad Norm: 0.0061 +[2025-02-20 10:22:35] (step=0774000) Train Loss: 0.1502, Train Steps/Sec: 11.69, Grad Norm: 0.0061 +[2025-02-20 10:22:43] (step=0774100) Train Loss: 0.1498, Train Steps/Sec: 11.69, Grad Norm: 0.0060 +[2025-02-20 10:22:52] (step=0774200) Train Loss: 0.1504, Train Steps/Sec: 11.69, Grad Norm: 0.0061 +[2025-02-20 10:23:00] (step=0774300) Train Loss: 0.1480, Train Steps/Sec: 11.69, Grad Norm: 0.0060 +[2025-02-20 10:23:10] (step=0774400) Train Loss: 0.1508, Train Steps/Sec: 10.05, Grad Norm: 0.0060 +[2025-02-20 10:23:19] (step=0774500) Train Loss: 0.1492, Train Steps/Sec: 11.68, Grad Norm: 0.0059 +[2025-02-20 10:23:28] (step=0774600) Train Loss: 0.1509, Train Steps/Sec: 10.97, Grad Norm: 0.0059 +[2025-02-20 10:23:37] (step=0774700) Train Loss: 0.1481, Train Steps/Sec: 10.91, Grad Norm: 0.0061 +[2025-02-20 10:23:46] (step=0774800) Train Loss: 0.1509, Train Steps/Sec: 10.92, Grad Norm: 0.0063 +[2025-02-20 10:23:55] (step=0774900) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 10:24:03] (step=0775000) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 10:24:12] (step=0775100) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 10:24:21] (step=0775200) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 10:24:29] (step=0775300) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 10:24:38] (step=0775400) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 10:24:47] (step=0775500) Train Loss: 0.1508, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 10:24:55] (step=0775600) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 10:25:05] (step=0775700) Train Loss: 0.1495, Train Steps/Sec: 9.98, Grad Norm: 0.0060 +[2025-02-20 10:25:14] (step=0775800) Train Loss: 0.1489, Train Steps/Sec: 11.28, Grad Norm: 0.0059 +[2025-02-20 10:25:23] (step=0775900) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 10:25:32] (step=0776000) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 10:25:40] (step=0776100) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 10:25:49] (step=0776200) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 10:25:57] (step=0776300) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 10:26:06] (step=0776400) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 10:26:15] (step=0776500) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 10:26:23] (step=0776600) Train Loss: 0.1511, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 10:26:32] (step=0776700) Train Loss: 0.1492, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-20 10:26:41] (step=0776800) Train Loss: 0.1492, Train Steps/Sec: 10.93, Grad Norm: 0.0063 +[2025-02-20 10:26:52] (step=0776900) Train Loss: 0.1504, Train Steps/Sec: 9.45, Grad Norm: 0.0060 +[2025-02-20 10:27:01] (step=0777000) Train Loss: 0.1505, Train Steps/Sec: 11.24, Grad Norm: 0.0058 +[2025-02-20 10:27:09] (step=0777100) Train Loss: 0.1514, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 10:27:18] (step=0777200) Train Loss: 0.1503, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 10:27:27] (step=0777300) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 10:27:35] (step=0777400) Train Loss: 0.1495, Train Steps/Sec: 11.58, Grad Norm: 0.0058 +[2025-02-20 10:27:44] (step=0777500) Train Loss: 0.1510, Train Steps/Sec: 11.58, Grad Norm: 0.0058 +[2025-02-20 10:27:53] (step=0777600) Train Loss: 0.1501, Train Steps/Sec: 11.22, Grad Norm: 0.0063 +[2025-02-20 10:28:01] (step=0777700) Train Loss: 0.1489, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 10:28:10] (step=0777800) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 10:28:19] (step=0777900) Train Loss: 0.1499, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 10:28:28] (step=0778000) Train Loss: 0.1491, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 10:28:36] (step=0778100) Train Loss: 0.1494, Train Steps/Sec: 11.54, Grad Norm: 0.0060 +[2025-02-20 10:28:46] (step=0778200) Train Loss: 0.1508, Train Steps/Sec: 10.00, Grad Norm: 0.0059 +[2025-02-20 10:28:55] (step=0778300) Train Loss: 0.1511, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 10:29:03] (step=0778400) Train Loss: 0.1510, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 10:29:12] (step=0778500) Train Loss: 0.1486, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 10:29:21] (step=0778600) Train Loss: 0.1501, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 10:29:29] (step=0778700) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 10:29:38] (step=0778800) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 10:29:47] (step=0778900) Train Loss: 0.1509, Train Steps/Sec: 11.26, Grad Norm: 0.0059 +[2025-02-20 10:29:56] (step=0779000) Train Loss: 0.1499, Train Steps/Sec: 10.61, Grad Norm: 0.0060 +[2025-02-20 10:30:05] (step=0779100) Train Loss: 0.1494, Train Steps/Sec: 10.94, Grad Norm: 0.0059 +[2025-02-20 10:30:14] (step=0779200) Train Loss: 0.1500, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 10:30:23] (step=0779300) Train Loss: 0.1507, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-20 10:30:33] (step=0779400) Train Loss: 0.1492, Train Steps/Sec: 10.08, Grad Norm: 0.0062 +[2025-02-20 10:30:41] (step=0779500) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 10:30:50] (step=0779600) Train Loss: 0.1487, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 10:30:58] (step=0779700) Train Loss: 0.1492, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 10:31:07] (step=0779800) Train Loss: 0.1506, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 10:31:16] (step=0779900) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 10:31:24] (step=0780000) Train Loss: 0.1504, Train Steps/Sec: 11.66, Grad Norm: 0.0065 +[2025-02-20 10:31:25] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0780000.pt +[2025-02-20 10:31:34] (step=0780100) Train Loss: 0.1492, Train Steps/Sec: 10.56, Grad Norm: 0.0063 +[2025-02-20 10:31:42] (step=0780200) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 10:31:51] (step=0780300) Train Loss: 0.1495, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 10:32:00] (step=0780400) Train Loss: 0.1495, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 10:32:08] (step=0780500) Train Loss: 0.1510, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 10:32:17] (step=0780600) Train Loss: 0.1485, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 10:32:27] (step=0780700) Train Loss: 0.1510, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-20 10:32:35] (step=0780800) Train Loss: 0.1492, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 10:32:44] (step=0780900) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 10:32:53] (step=0781000) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 10:33:01] (step=0781100) Train Loss: 0.1504, Train Steps/Sec: 11.29, Grad Norm: 0.0060 +[2025-02-20 10:33:11] (step=0781200) Train Loss: 0.1497, Train Steps/Sec: 10.60, Grad Norm: 0.0060 +[2025-02-20 10:33:20] (step=0781300) Train Loss: 0.1495, Train Steps/Sec: 10.94, Grad Norm: 0.0061 +[2025-02-20 10:33:29] (step=0781400) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 10:33:37] (step=0781500) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 10:33:46] (step=0781600) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 10:33:54] (step=0781700) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 10:34:03] (step=0781800) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 10:34:13] (step=0781900) Train Loss: 0.1497, Train Steps/Sec: 10.00, Grad Norm: 0.0060 +[2025-02-20 10:34:22] (step=0782000) Train Loss: 0.1493, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-20 10:34:30] (step=0782100) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 10:34:39] (step=0782200) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 10:34:48] (step=0782300) Train Loss: 0.1513, Train Steps/Sec: 11.28, Grad Norm: 0.0059 +[2025-02-20 10:34:57] (step=0782400) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 10:35:05] (step=0782500) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 10:35:14] (step=0782600) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 10:35:22] (step=0782700) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 10:35:31] (step=0782800) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 10:35:40] (step=0782900) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 10:35:48] (step=0783000) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 10:35:57] (step=0783100) Train Loss: 0.1486, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 10:36:07] (step=0783200) Train Loss: 0.1488, Train Steps/Sec: 10.01, Grad Norm: 0.0059 +[2025-02-20 10:36:16] (step=0783300) Train Loss: 0.1499, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-20 10:36:25] (step=0783400) Train Loss: 0.1508, Train Steps/Sec: 10.88, Grad Norm: 0.0061 +[2025-02-20 10:36:34] (step=0783500) Train Loss: 0.1489, Train Steps/Sec: 10.91, Grad Norm: 0.0061 +[2025-02-20 10:36:43] (step=0783600) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 10:36:52] (step=0783700) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 10:37:00] (step=0783800) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 10:37:09] (step=0783900) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 10:37:17] (step=0784000) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 10:37:26] (step=0784100) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 10:37:35] (step=0784200) Train Loss: 0.1491, Train Steps/Sec: 11.23, Grad Norm: 0.0063 +[2025-02-20 10:37:44] (step=0784300) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 10:37:54] (step=0784400) Train Loss: 0.1494, Train Steps/Sec: 9.72, Grad Norm: 0.0059 +[2025-02-20 10:38:02] (step=0784500) Train Loss: 0.1509, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 10:38:11] (step=0784600) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 10:38:20] (step=0784700) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 10:38:28] (step=0784800) Train Loss: 0.1512, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 10:38:37] (step=0784900) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 10:38:46] (step=0785000) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 10:38:54] (step=0785100) Train Loss: 0.1504, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 10:39:03] (step=0785200) Train Loss: 0.1507, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 10:39:11] (step=0785300) Train Loss: 0.1517, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 10:39:20] (step=0785400) Train Loss: 0.1509, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 10:39:30] (step=0785500) Train Loss: 0.1494, Train Steps/Sec: 10.58, Grad Norm: 0.0061 +[2025-02-20 10:39:39] (step=0785600) Train Loss: 0.1492, Train Steps/Sec: 11.17, Grad Norm: 0.0062 +[2025-02-20 10:39:49] (step=0785700) Train Loss: 0.1503, Train Steps/Sec: 9.75, Grad Norm: 0.0062 +[2025-02-20 10:39:58] (step=0785800) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 10:40:06] (step=0785900) Train Loss: 0.1492, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 10:40:15] (step=0786000) Train Loss: 0.1492, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 10:40:24] (step=0786100) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 10:40:32] (step=0786200) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 10:40:41] (step=0786300) Train Loss: 0.1495, Train Steps/Sec: 11.24, Grad Norm: 0.0059 +[2025-02-20 10:40:50] (step=0786400) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 10:40:58] (step=0786500) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 10:41:07] (step=0786600) Train Loss: 0.1489, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-20 10:41:16] (step=0786700) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 10:41:24] (step=0786800) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 10:41:34] (step=0786900) Train Loss: 0.1500, Train Steps/Sec: 10.06, Grad Norm: 0.0062 +[2025-02-20 10:41:43] (step=0787000) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 10:41:52] (step=0787100) Train Loss: 0.1485, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 10:42:00] (step=0787200) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 10:42:09] (step=0787300) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 10:42:17] (step=0787400) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 10:42:26] (step=0787500) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 10:42:35] (step=0787600) Train Loss: 0.1489, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-20 10:42:44] (step=0787700) Train Loss: 0.1503, Train Steps/Sec: 10.61, Grad Norm: 0.0066 +[2025-02-20 10:42:53] (step=0787800) Train Loss: 0.1513, Train Steps/Sec: 10.93, Grad Norm: 0.0061 +[2025-02-20 10:43:02] (step=0787900) Train Loss: 0.1498, Train Steps/Sec: 11.46, Grad Norm: 0.0062 +[2025-02-20 10:43:11] (step=0788000) Train Loss: 0.1502, Train Steps/Sec: 11.42, Grad Norm: 0.0060 +[2025-02-20 10:43:20] (step=0788100) Train Loss: 0.1498, Train Steps/Sec: 11.37, Grad Norm: 0.0061 +[2025-02-20 10:43:30] (step=0788200) Train Loss: 0.1496, Train Steps/Sec: 9.91, Grad Norm: 0.0061 +[2025-02-20 10:43:38] (step=0788300) Train Loss: 0.1504, Train Steps/Sec: 11.52, Grad Norm: 0.0061 +[2025-02-20 10:43:47] (step=0788400) Train Loss: 0.1486, Train Steps/Sec: 11.52, Grad Norm: 0.0062 +[2025-02-20 10:43:56] (step=0788500) Train Loss: 0.1500, Train Steps/Sec: 11.15, Grad Norm: 0.0061 +[2025-02-20 10:44:05] (step=0788600) Train Loss: 0.1514, Train Steps/Sec: 11.52, Grad Norm: 0.0063 +[2025-02-20 10:44:13] (step=0788700) Train Loss: 0.1512, Train Steps/Sec: 11.52, Grad Norm: 0.0060 +[2025-02-20 10:44:22] (step=0788800) Train Loss: 0.1488, Train Steps/Sec: 11.19, Grad Norm: 0.0062 +[2025-02-20 10:44:31] (step=0788900) Train Loss: 0.1508, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 10:44:40] (step=0789000) Train Loss: 0.1484, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 10:44:48] (step=0789100) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 10:44:57] (step=0789200) Train Loss: 0.1507, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 10:45:06] (step=0789300) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 10:45:16] (step=0789400) Train Loss: 0.1510, Train Steps/Sec: 10.01, Grad Norm: 0.0062 +[2025-02-20 10:45:24] (step=0789500) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 10:45:33] (step=0789600) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 10:45:41] (step=0789700) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 10:45:51] (step=0789800) Train Loss: 0.1488, Train Steps/Sec: 10.90, Grad Norm: 0.0061 +[2025-02-20 10:46:00] (step=0789900) Train Loss: 0.1501, Train Steps/Sec: 10.89, Grad Norm: 0.0064 +[2025-02-20 10:46:09] (step=0790000) Train Loss: 0.1503, Train Steps/Sec: 10.95, Grad Norm: 0.0059 +[2025-02-20 10:46:18] (step=0790100) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 10:46:26] (step=0790200) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 10:46:35] (step=0790300) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 10:46:43] (step=0790400) Train Loss: 0.1494, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 10:46:52] (step=0790500) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 10:47:01] (step=0790600) Train Loss: 0.1499, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 10:47:11] (step=0790700) Train Loss: 0.1507, Train Steps/Sec: 9.72, Grad Norm: 0.0058 +[2025-02-20 10:47:20] (step=0790800) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 10:47:28] (step=0790900) Train Loss: 0.1490, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 10:47:37] (step=0791000) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 10:47:46] (step=0791100) Train Loss: 0.1498, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 10:47:54] (step=0791200) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 10:48:03] (step=0791300) Train Loss: 0.1489, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 10:48:11] (step=0791400) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 10:48:20] (step=0791500) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 10:48:29] (step=0791600) Train Loss: 0.1488, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 10:48:37] (step=0791700) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 10:48:46] (step=0791800) Train Loss: 0.1486, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 10:48:56] (step=0791900) Train Loss: 0.1497, Train Steps/Sec: 10.00, Grad Norm: 0.0060 +[2025-02-20 10:49:05] (step=0792000) Train Loss: 0.1503, Train Steps/Sec: 10.89, Grad Norm: 0.0062 +[2025-02-20 10:49:15] (step=0792100) Train Loss: 0.1490, Train Steps/Sec: 10.28, Grad Norm: 0.0059 +[2025-02-20 10:49:23] (step=0792200) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 10:49:32] (step=0792300) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 10:49:41] (step=0792400) Train Loss: 0.1517, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 10:49:49] (step=0792500) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 10:49:58] (step=0792600) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 10:50:06] (step=0792700) Train Loss: 0.1517, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 10:50:15] (step=0792800) Train Loss: 0.1490, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 10:50:24] (step=0792900) Train Loss: 0.1495, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 10:50:33] (step=0793000) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 10:50:41] (step=0793100) Train Loss: 0.1508, Train Steps/Sec: 11.30, Grad Norm: 0.0060 +[2025-02-20 10:50:51] (step=0793200) Train Loss: 0.1508, Train Steps/Sec: 9.97, Grad Norm: 0.0059 +[2025-02-20 10:51:00] (step=0793300) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 10:51:09] (step=0793400) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 10:51:17] (step=0793500) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 10:51:26] (step=0793600) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 10:51:34] (step=0793700) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 10:51:43] (step=0793800) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 10:51:52] (step=0793900) Train Loss: 0.1489, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 10:52:00] (step=0794000) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 10:52:09] (step=0794100) Train Loss: 0.1491, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 10:52:19] (step=0794200) Train Loss: 0.1517, Train Steps/Sec: 10.55, Grad Norm: 0.0061 +[2025-02-20 10:52:28] (step=0794300) Train Loss: 0.1498, Train Steps/Sec: 10.96, Grad Norm: 0.0063 +[2025-02-20 10:52:38] (step=0794400) Train Loss: 0.1492, Train Steps/Sec: 9.99, Grad Norm: 0.0060 +[2025-02-20 10:52:46] (step=0794500) Train Loss: 0.1488, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 10:52:55] (step=0794600) Train Loss: 0.1490, Train Steps/Sec: 11.59, Grad Norm: 0.0058 +[2025-02-20 10:53:04] (step=0794700) Train Loss: 0.1485, Train Steps/Sec: 11.59, Grad Norm: 0.0058 +[2025-02-20 10:53:12] (step=0794800) Train Loss: 0.1492, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 10:53:21] (step=0794900) Train Loss: 0.1501, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 10:53:30] (step=0795000) Train Loss: 0.1501, Train Steps/Sec: 11.20, Grad Norm: 0.0059 +[2025-02-20 10:53:38] (step=0795100) Train Loss: 0.1504, Train Steps/Sec: 11.59, Grad Norm: 0.0066 +[2025-02-20 10:53:47] (step=0795200) Train Loss: 0.1485, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 10:53:56] (step=0795300) Train Loss: 0.1500, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 10:54:05] (step=0795400) Train Loss: 0.1498, Train Steps/Sec: 11.54, Grad Norm: 0.0060 +[2025-02-20 10:54:13] (step=0795500) Train Loss: 0.1500, Train Steps/Sec: 11.58, Grad Norm: 0.0058 +[2025-02-20 10:54:22] (step=0795600) Train Loss: 0.1509, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 10:54:32] (step=0795700) Train Loss: 0.1504, Train Steps/Sec: 10.07, Grad Norm: 0.0059 +[2025-02-20 10:54:40] (step=0795800) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 10:54:49] (step=0795900) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 10:54:58] (step=0796000) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 10:55:06] (step=0796100) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 10:55:15] (step=0796200) Train Loss: 0.1481, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 10:55:24] (step=0796300) Train Loss: 0.1508, Train Steps/Sec: 11.24, Grad Norm: 0.0059 +[2025-02-20 10:55:33] (step=0796400) Train Loss: 0.1490, Train Steps/Sec: 10.50, Grad Norm: 0.0061 +[2025-02-20 10:55:43] (step=0796500) Train Loss: 0.1507, Train Steps/Sec: 10.89, Grad Norm: 0.0061 +[2025-02-20 10:55:51] (step=0796600) Train Loss: 0.1508, Train Steps/Sec: 11.55, Grad Norm: 0.0062 +[2025-02-20 10:56:00] (step=0796700) Train Loss: 0.1498, Train Steps/Sec: 11.55, Grad Norm: 0.0058 +[2025-02-20 10:56:08] (step=0796800) Train Loss: 0.1509, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 10:56:18] (step=0796900) Train Loss: 0.1509, Train Steps/Sec: 10.01, Grad Norm: 0.0063 +[2025-02-20 10:56:27] (step=0797000) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 10:56:36] (step=0797100) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 10:56:45] (step=0797200) Train Loss: 0.1495, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-20 10:56:53] (step=0797300) Train Loss: 0.1486, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 10:57:02] (step=0797400) Train Loss: 0.1492, Train Steps/Sec: 11.26, Grad Norm: 0.0064 +[2025-02-20 10:57:11] (step=0797500) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 10:57:19] (step=0797600) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 10:57:28] (step=0797700) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 10:57:37] (step=0797800) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 10:57:45] (step=0797900) Train Loss: 0.1482, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 10:57:54] (step=0798000) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 10:58:02] (step=0798100) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 10:58:12] (step=0798200) Train Loss: 0.1492, Train Steps/Sec: 9.98, Grad Norm: 0.0060 +[2025-02-20 10:58:21] (step=0798300) Train Loss: 0.1501, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 10:58:30] (step=0798400) Train Loss: 0.1492, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 10:58:39] (step=0798500) Train Loss: 0.1487, Train Steps/Sec: 10.87, Grad Norm: 0.0062 +[2025-02-20 10:58:48] (step=0798600) Train Loss: 0.1507, Train Steps/Sec: 10.55, Grad Norm: 0.0063 +[2025-02-20 10:58:57] (step=0798700) Train Loss: 0.1500, Train Steps/Sec: 11.21, Grad Norm: 0.0061 +[2025-02-20 10:59:06] (step=0798800) Train Loss: 0.1492, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 10:59:15] (step=0798900) Train Loss: 0.1498, Train Steps/Sec: 11.54, Grad Norm: 0.0058 +[2025-02-20 10:59:23] (step=0799000) Train Loss: 0.1496, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 10:59:32] (step=0799100) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 10:59:40] (step=0799200) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 10:59:49] (step=0799300) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 10:59:59] (step=0799400) Train Loss: 0.1496, Train Steps/Sec: 9.70, Grad Norm: 0.0060 +[2025-02-20 11:00:08] (step=0799500) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 11:00:17] (step=0799600) Train Loss: 0.1502, Train Steps/Sec: 11.26, Grad Norm: 0.0059 +[2025-02-20 11:00:26] (step=0799700) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 11:00:34] (step=0799800) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 11:00:43] (step=0799900) Train Loss: 0.1487, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 11:00:51] (step=0800000) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 11:00:52] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0800000.pt +[2025-02-20 11:01:01] (step=0800100) Train Loss: 0.1489, Train Steps/Sec: 10.38, Grad Norm: 0.0060 +[2025-02-20 11:01:10] (step=0800200) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 11:01:18] (step=0800300) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 11:01:27] (step=0800400) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 11:01:36] (step=0800500) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 11:01:44] (step=0800600) Train Loss: 0.1498, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 11:01:55] (step=0800700) Train Loss: 0.1507, Train Steps/Sec: 9.16, Grad Norm: 0.0061 +[2025-02-20 11:02:05] (step=0800800) Train Loss: 0.1514, Train Steps/Sec: 10.54, Grad Norm: 0.0061 +[2025-02-20 11:02:13] (step=0800900) Train Loss: 0.1497, Train Steps/Sec: 11.55, Grad Norm: 0.0061 +[2025-02-20 11:02:22] (step=0801000) Train Loss: 0.1497, Train Steps/Sec: 11.54, Grad Norm: 0.0062 +[2025-02-20 11:02:31] (step=0801100) Train Loss: 0.1495, Train Steps/Sec: 11.53, Grad Norm: 0.0065 +[2025-02-20 11:02:39] (step=0801200) Train Loss: 0.1509, Train Steps/Sec: 11.53, Grad Norm: 0.0060 +[2025-02-20 11:02:48] (step=0801300) Train Loss: 0.1492, Train Steps/Sec: 11.54, Grad Norm: 0.0059 +[2025-02-20 11:02:57] (step=0801400) Train Loss: 0.1509, Train Steps/Sec: 11.54, Grad Norm: 0.0059 +[2025-02-20 11:03:06] (step=0801500) Train Loss: 0.1499, Train Steps/Sec: 11.14, Grad Norm: 0.0063 +[2025-02-20 11:03:14] (step=0801600) Train Loss: 0.1510, Train Steps/Sec: 11.54, Grad Norm: 0.0063 +[2025-02-20 11:03:23] (step=0801700) Train Loss: 0.1508, Train Steps/Sec: 11.54, Grad Norm: 0.0062 +[2025-02-20 11:03:32] (step=0801800) Train Loss: 0.1509, Train Steps/Sec: 11.13, Grad Norm: 0.0061 +[2025-02-20 11:03:42] (step=0801900) Train Loss: 0.1506, Train Steps/Sec: 9.85, Grad Norm: 0.0060 +[2025-02-20 11:03:51] (step=0802000) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 11:03:59] (step=0802100) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 11:04:08] (step=0802200) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 11:04:16] (step=0802300) Train Loss: 0.1508, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 11:04:25] (step=0802400) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 11:04:34] (step=0802500) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 11:04:42] (step=0802600) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 11:04:51] (step=0802700) Train Loss: 0.1484, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 11:05:00] (step=0802800) Train Loss: 0.1510, Train Steps/Sec: 11.28, Grad Norm: 0.0059 +[2025-02-20 11:05:09] (step=0802900) Train Loss: 0.1491, Train Steps/Sec: 10.55, Grad Norm: 0.0060 +[2025-02-20 11:05:18] (step=0803000) Train Loss: 0.1493, Train Steps/Sec: 10.95, Grad Norm: 0.0061 +[2025-02-20 11:05:27] (step=0803100) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 11:05:37] (step=0803200) Train Loss: 0.1511, Train Steps/Sec: 9.88, Grad Norm: 0.0062 +[2025-02-20 11:05:46] (step=0803300) Train Loss: 0.1508, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 11:05:54] (step=0803400) Train Loss: 0.1502, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 11:06:03] (step=0803500) Train Loss: 0.1483, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 11:06:12] (step=0803600) Train Loss: 0.1501, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 11:06:21] (step=0803700) Train Loss: 0.1505, Train Steps/Sec: 11.19, Grad Norm: 0.0063 +[2025-02-20 11:06:29] (step=0803800) Train Loss: 0.1508, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 11:06:38] (step=0803900) Train Loss: 0.1493, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 11:06:47] (step=0804000) Train Loss: 0.1486, Train Steps/Sec: 11.19, Grad Norm: 0.0061 +[2025-02-20 11:06:56] (step=0804100) Train Loss: 0.1495, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 11:07:04] (step=0804200) Train Loss: 0.1514, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 11:07:13] (step=0804300) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 11:07:23] (step=0804400) Train Loss: 0.1507, Train Steps/Sec: 10.00, Grad Norm: 0.0061 +[2025-02-20 11:07:31] (step=0804500) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 11:07:40] (step=0804600) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 11:07:49] (step=0804700) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 11:07:57] (step=0804800) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 11:08:06] (step=0804900) Train Loss: 0.1484, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 11:08:15] (step=0805000) Train Loss: 0.1497, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 11:08:24] (step=0805100) Train Loss: 0.1499, Train Steps/Sec: 10.59, Grad Norm: 0.0062 +[2025-02-20 11:08:33] (step=0805200) Train Loss: 0.1502, Train Steps/Sec: 10.95, Grad Norm: 0.0058 +[2025-02-20 11:08:42] (step=0805300) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 11:08:51] (step=0805400) Train Loss: 0.1486, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 11:08:59] (step=0805500) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 11:09:08] (step=0805600) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0057 +[2025-02-20 11:09:18] (step=0805700) Train Loss: 0.1511, Train Steps/Sec: 10.03, Grad Norm: 0.0064 +[2025-02-20 11:09:26] (step=0805800) Train Loss: 0.1491, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 11:09:35] (step=0805900) Train Loss: 0.1499, Train Steps/Sec: 11.18, Grad Norm: 0.0059 +[2025-02-20 11:09:44] (step=0806000) Train Loss: 0.1506, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 11:09:53] (step=0806100) Train Loss: 0.1513, Train Steps/Sec: 11.19, Grad Norm: 0.0062 +[2025-02-20 11:10:02] (step=0806200) Train Loss: 0.1490, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 11:10:10] (step=0806300) Train Loss: 0.1501, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 11:10:19] (step=0806400) Train Loss: 0.1495, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 11:10:27] (step=0806500) Train Loss: 0.1489, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 11:10:36] (step=0806600) Train Loss: 0.1509, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 11:10:45] (step=0806700) Train Loss: 0.1497, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 11:10:53] (step=0806800) Train Loss: 0.1490, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 11:11:03] (step=0806900) Train Loss: 0.1507, Train Steps/Sec: 9.99, Grad Norm: 0.0059 +[2025-02-20 11:11:12] (step=0807000) Train Loss: 0.1502, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 11:11:21] (step=0807100) Train Loss: 0.1492, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 11:11:30] (step=0807200) Train Loss: 0.1500, Train Steps/Sec: 10.87, Grad Norm: 0.0064 +[2025-02-20 11:11:39] (step=0807300) Train Loss: 0.1500, Train Steps/Sec: 10.58, Grad Norm: 0.0063 +[2025-02-20 11:11:48] (step=0807400) Train Loss: 0.1484, Train Steps/Sec: 11.22, Grad Norm: 0.0060 +[2025-02-20 11:11:57] (step=0807500) Train Loss: 0.1502, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 11:12:06] (step=0807600) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 11:12:14] (step=0807700) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 11:12:23] (step=0807800) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 11:12:31] (step=0807900) Train Loss: 0.1477, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 11:12:40] (step=0808000) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 11:12:49] (step=0808100) Train Loss: 0.1496, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-20 11:12:59] (step=0808200) Train Loss: 0.1490, Train Steps/Sec: 9.91, Grad Norm: 0.0060 +[2025-02-20 11:13:08] (step=0808300) Train Loss: 0.1499, Train Steps/Sec: 11.23, Grad Norm: 0.0064 +[2025-02-20 11:13:16] (step=0808400) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 11:13:25] (step=0808500) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 11:13:34] (step=0808600) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 11:13:42] (step=0808700) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 11:13:51] (step=0808800) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 11:13:59] (step=0808900) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 11:14:08] (step=0809000) Train Loss: 0.1482, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 11:14:17] (step=0809100) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 11:14:25] (step=0809200) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 11:14:34] (step=0809300) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 11:14:45] (step=0809400) Train Loss: 0.1494, Train Steps/Sec: 9.22, Grad Norm: 0.0061 +[2025-02-20 11:14:54] (step=0809500) Train Loss: 0.1501, Train Steps/Sec: 10.60, Grad Norm: 0.0058 +[2025-02-20 11:15:03] (step=0809600) Train Loss: 0.1497, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 11:15:11] (step=0809700) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 11:15:20] (step=0809800) Train Loss: 0.1519, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 11:15:29] (step=0809900) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 11:15:37] (step=0810000) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 11:15:46] (step=0810100) Train Loss: 0.1499, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 11:15:55] (step=0810200) Train Loss: 0.1505, Train Steps/Sec: 11.21, Grad Norm: 0.0063 +[2025-02-20 11:16:03] (step=0810300) Train Loss: 0.1494, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 11:16:12] (step=0810400) Train Loss: 0.1498, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 11:16:21] (step=0810500) Train Loss: 0.1488, Train Steps/Sec: 11.19, Grad Norm: 0.0061 +[2025-02-20 11:16:30] (step=0810600) Train Loss: 0.1507, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 11:16:40] (step=0810700) Train Loss: 0.1506, Train Steps/Sec: 9.88, Grad Norm: 0.0062 +[2025-02-20 11:16:48] (step=0810800) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 11:16:57] (step=0810900) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 11:17:06] (step=0811000) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 11:17:14] (step=0811100) Train Loss: 0.1510, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 11:17:23] (step=0811200) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 11:17:32] (step=0811300) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0058 +[2025-02-20 11:17:40] (step=0811400) Train Loss: 0.1494, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 11:17:49] (step=0811500) Train Loss: 0.1501, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-20 11:17:58] (step=0811600) Train Loss: 0.1498, Train Steps/Sec: 10.57, Grad Norm: 0.0061 +[2025-02-20 11:18:08] (step=0811700) Train Loss: 0.1490, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-20 11:18:16] (step=0811800) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 11:18:26] (step=0811900) Train Loss: 0.1509, Train Steps/Sec: 9.93, Grad Norm: 0.0059 +[2025-02-20 11:18:35] (step=0812000) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 11:18:44] (step=0812100) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 11:18:52] (step=0812200) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 11:19:01] (step=0812300) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 11:19:10] (step=0812400) Train Loss: 0.1480, Train Steps/Sec: 11.18, Grad Norm: 0.0058 +[2025-02-20 11:19:18] (step=0812500) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 11:19:27] (step=0812600) Train Loss: 0.1493, Train Steps/Sec: 11.22, Grad Norm: 0.0064 +[2025-02-20 11:19:36] (step=0812700) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 11:19:44] (step=0812800) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 11:19:53] (step=0812900) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 11:20:02] (step=0813000) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 11:20:10] (step=0813100) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 11:20:20] (step=0813200) Train Loss: 0.1505, Train Steps/Sec: 9.87, Grad Norm: 0.0060 +[2025-02-20 11:20:29] (step=0813300) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 11:20:38] (step=0813400) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0058 +[2025-02-20 11:20:46] (step=0813500) Train Loss: 0.1493, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 11:20:55] (step=0813600) Train Loss: 0.1497, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 11:21:04] (step=0813700) Train Loss: 0.1506, Train Steps/Sec: 10.89, Grad Norm: 0.0063 +[2025-02-20 11:21:13] (step=0813800) Train Loss: 0.1499, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-20 11:21:22] (step=0813900) Train Loss: 0.1499, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-20 11:21:31] (step=0814000) Train Loss: 0.1489, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 11:21:40] (step=0814100) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 11:21:48] (step=0814200) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 11:21:57] (step=0814300) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 11:22:06] (step=0814400) Train Loss: 0.1488, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 11:22:15] (step=0814500) Train Loss: 0.1485, Train Steps/Sec: 10.03, Grad Norm: 0.0060 +[2025-02-20 11:22:24] (step=0814600) Train Loss: 0.1503, Train Steps/Sec: 11.27, Grad Norm: 0.0064 +[2025-02-20 11:22:33] (step=0814700) Train Loss: 0.1511, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 11:22:42] (step=0814800) Train Loss: 0.1497, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 11:22:50] (step=0814900) Train Loss: 0.1506, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 11:22:59] (step=0815000) Train Loss: 0.1500, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 11:23:08] (step=0815100) Train Loss: 0.1498, Train Steps/Sec: 11.67, Grad Norm: 0.0059 +[2025-02-20 11:23:16] (step=0815200) Train Loss: 0.1487, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 11:23:25] (step=0815300) Train Loss: 0.1500, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 11:23:33] (step=0815400) Train Loss: 0.1491, Train Steps/Sec: 11.65, Grad Norm: 0.0058 +[2025-02-20 11:23:42] (step=0815500) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 11:23:50] (step=0815600) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 11:24:00] (step=0815700) Train Loss: 0.1493, Train Steps/Sec: 9.98, Grad Norm: 0.0060 +[2025-02-20 11:24:09] (step=0815800) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 11:24:18] (step=0815900) Train Loss: 0.1509, Train Steps/Sec: 10.63, Grad Norm: 0.0061 +[2025-02-20 11:24:28] (step=0816000) Train Loss: 0.1504, Train Steps/Sec: 10.97, Grad Norm: 0.0060 +[2025-02-20 11:24:36] (step=0816100) Train Loss: 0.1505, Train Steps/Sec: 11.30, Grad Norm: 0.0060 +[2025-02-20 11:24:45] (step=0816200) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0058 +[2025-02-20 11:24:54] (step=0816300) Train Loss: 0.1509, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 11:25:02] (step=0816400) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 11:25:11] (step=0816500) Train Loss: 0.1517, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 11:25:19] (step=0816600) Train Loss: 0.1497, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-20 11:25:28] (step=0816700) Train Loss: 0.1492, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 11:25:37] (step=0816800) Train Loss: 0.1501, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 11:25:45] (step=0816900) Train Loss: 0.1499, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 11:25:56] (step=0817000) Train Loss: 0.1485, Train Steps/Sec: 9.62, Grad Norm: 0.0059 +[2025-02-20 11:26:04] (step=0817100) Train Loss: 0.1500, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 11:26:13] (step=0817200) Train Loss: 0.1503, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-20 11:26:22] (step=0817300) Train Loss: 0.1503, Train Steps/Sec: 11.66, Grad Norm: 0.0063 +[2025-02-20 11:26:30] (step=0817400) Train Loss: 0.1499, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 11:26:39] (step=0817500) Train Loss: 0.1508, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 11:26:47] (step=0817600) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 11:26:56] (step=0817700) Train Loss: 0.1496, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-20 11:27:05] (step=0817800) Train Loss: 0.1499, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 11:27:13] (step=0817900) Train Loss: 0.1513, Train Steps/Sec: 11.65, Grad Norm: 0.0058 +[2025-02-20 11:27:22] (step=0818000) Train Loss: 0.1509, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 11:27:31] (step=0818100) Train Loss: 0.1490, Train Steps/Sec: 10.63, Grad Norm: 0.0059 +[2025-02-20 11:27:42] (step=0818200) Train Loss: 0.1501, Train Steps/Sec: 9.19, Grad Norm: 0.0062 +[2025-02-20 11:27:51] (step=0818300) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 11:27:59] (step=0818400) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 11:28:08] (step=0818500) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 11:28:16] (step=0818600) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 11:28:25] (step=0818700) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 11:28:34] (step=0818800) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 11:28:43] (step=0818900) Train Loss: 0.1491, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 11:28:51] (step=0819000) Train Loss: 0.1488, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 11:29:00] (step=0819100) Train Loss: 0.1509, Train Steps/Sec: 11.23, Grad Norm: 0.0059 +[2025-02-20 11:29:09] (step=0819200) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 11:29:17] (step=0819300) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 11:29:26] (step=0819400) Train Loss: 0.1506, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 11:29:36] (step=0819500) Train Loss: 0.1499, Train Steps/Sec: 9.93, Grad Norm: 0.0060 +[2025-02-20 11:29:45] (step=0819600) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 11:29:53] (step=0819700) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 11:30:02] (step=0819800) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 11:30:10] (step=0819900) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 11:30:19] (step=0820000) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 11:30:19] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0820000.pt +[2025-02-20 11:30:28] (step=0820100) Train Loss: 0.1495, Train Steps/Sec: 11.07, Grad Norm: 0.0065 +[2025-02-20 11:30:37] (step=0820200) Train Loss: 0.1512, Train Steps/Sec: 11.28, Grad Norm: 0.0059 +[2025-02-20 11:30:46] (step=0820300) Train Loss: 0.1500, Train Steps/Sec: 10.60, Grad Norm: 0.0064 +[2025-02-20 11:30:55] (step=0820400) Train Loss: 0.1492, Train Steps/Sec: 10.96, Grad Norm: 0.0059 +[2025-02-20 11:31:04] (step=0820500) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 11:31:13] (step=0820600) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 11:31:23] (step=0820700) Train Loss: 0.1488, Train Steps/Sec: 9.93, Grad Norm: 0.0060 +[2025-02-20 11:31:31] (step=0820800) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 11:31:40] (step=0820900) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 11:31:49] (step=0821000) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 11:31:57] (step=0821100) Train Loss: 0.1505, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-20 11:32:06] (step=0821200) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 11:32:15] (step=0821300) Train Loss: 0.1493, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 11:32:24] (step=0821400) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 11:32:32] (step=0821500) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 11:32:41] (step=0821600) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 11:32:49] (step=0821700) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 11:32:58] (step=0821800) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 11:33:07] (step=0821900) Train Loss: 0.1493, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 11:33:17] (step=0822000) Train Loss: 0.1508, Train Steps/Sec: 9.83, Grad Norm: 0.0060 +[2025-02-20 11:33:26] (step=0822100) Train Loss: 0.1500, Train Steps/Sec: 11.49, Grad Norm: 0.0060 +[2025-02-20 11:33:34] (step=0822200) Train Loss: 0.1506, Train Steps/Sec: 11.49, Grad Norm: 0.0062 +[2025-02-20 11:33:43] (step=0822300) Train Loss: 0.1504, Train Steps/Sec: 11.49, Grad Norm: 0.0058 +[2025-02-20 11:33:52] (step=0822400) Train Loss: 0.1510, Train Steps/Sec: 10.80, Grad Norm: 0.0061 +[2025-02-20 11:34:02] (step=0822500) Train Loss: 0.1507, Train Steps/Sec: 10.79, Grad Norm: 0.0060 +[2025-02-20 11:34:11] (step=0822600) Train Loss: 0.1497, Train Steps/Sec: 10.86, Grad Norm: 0.0059 +[2025-02-20 11:34:19] (step=0822700) Train Loss: 0.1498, Train Steps/Sec: 11.50, Grad Norm: 0.0062 +[2025-02-20 11:34:28] (step=0822800) Train Loss: 0.1503, Train Steps/Sec: 11.51, Grad Norm: 0.0061 +[2025-02-20 11:34:37] (step=0822900) Train Loss: 0.1494, Train Steps/Sec: 11.53, Grad Norm: 0.0061 +[2025-02-20 11:34:45] (step=0823000) Train Loss: 0.1502, Train Steps/Sec: 11.53, Grad Norm: 0.0062 +[2025-02-20 11:34:54] (step=0823100) Train Loss: 0.1506, Train Steps/Sec: 11.53, Grad Norm: 0.0061 +[2025-02-20 11:35:04] (step=0823200) Train Loss: 0.1504, Train Steps/Sec: 9.87, Grad Norm: 0.0059 +[2025-02-20 11:35:13] (step=0823300) Train Loss: 0.1496, Train Steps/Sec: 11.20, Grad Norm: 0.0061 +[2025-02-20 11:35:22] (step=0823400) Train Loss: 0.1510, Train Steps/Sec: 11.58, Grad Norm: 0.0058 +[2025-02-20 11:35:31] (step=0823500) Train Loss: 0.1497, Train Steps/Sec: 11.20, Grad Norm: 0.0061 +[2025-02-20 11:35:39] (step=0823600) Train Loss: 0.1492, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 11:35:48] (step=0823700) Train Loss: 0.1506, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 11:35:57] (step=0823800) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 11:36:05] (step=0823900) Train Loss: 0.1478, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 11:36:14] (step=0824000) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 11:36:22] (step=0824100) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 11:36:31] (step=0824200) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 11:36:40] (step=0824300) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 11:36:48] (step=0824400) Train Loss: 0.1490, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 11:36:58] (step=0824500) Train Loss: 0.1504, Train Steps/Sec: 10.06, Grad Norm: 0.0062 +[2025-02-20 11:37:08] (step=0824600) Train Loss: 0.1494, Train Steps/Sec: 10.27, Grad Norm: 0.0061 +[2025-02-20 11:37:17] (step=0824700) Train Loss: 0.1496, Train Steps/Sec: 11.28, Grad Norm: 0.0059 +[2025-02-20 11:37:26] (step=0824800) Train Loss: 0.1504, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 11:37:34] (step=0824900) Train Loss: 0.1497, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 11:37:43] (step=0825000) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 11:37:52] (step=0825100) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 11:38:00] (step=0825200) Train Loss: 0.1501, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 11:38:09] (step=0825300) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 11:38:17] (step=0825400) Train Loss: 0.1508, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 11:38:26] (step=0825500) Train Loss: 0.1500, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 11:38:35] (step=0825600) Train Loss: 0.1503, Train Steps/Sec: 11.26, Grad Norm: 0.0059 +[2025-02-20 11:38:45] (step=0825700) Train Loss: 0.1505, Train Steps/Sec: 9.98, Grad Norm: 0.0061 +[2025-02-20 11:38:54] (step=0825800) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 11:39:02] (step=0825900) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 11:39:11] (step=0826000) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 11:39:19] (step=0826100) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 11:39:28] (step=0826200) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 11:39:37] (step=0826300) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 11:39:45] (step=0826400) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 11:39:54] (step=0826500) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 11:40:02] (step=0826600) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 11:40:11] (step=0826700) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 11:40:21] (step=0826800) Train Loss: 0.1496, Train Steps/Sec: 10.27, Grad Norm: 0.0062 +[2025-02-20 11:40:30] (step=0826900) Train Loss: 0.1482, Train Steps/Sec: 10.91, Grad Norm: 0.0060 +[2025-02-20 11:40:40] (step=0827000) Train Loss: 0.1502, Train Steps/Sec: 9.92, Grad Norm: 0.0063 +[2025-02-20 11:40:49] (step=0827100) Train Loss: 0.1492, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 11:40:57] (step=0827200) Train Loss: 0.1488, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 11:41:06] (step=0827300) Train Loss: 0.1492, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 11:41:15] (step=0827400) Train Loss: 0.1491, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 11:41:23] (step=0827500) Train Loss: 0.1497, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 11:41:32] (step=0827600) Train Loss: 0.1497, Train Steps/Sec: 11.21, Grad Norm: 0.0062 +[2025-02-20 11:41:41] (step=0827700) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 11:41:50] (step=0827800) Train Loss: 0.1506, Train Steps/Sec: 11.21, Grad Norm: 0.0064 +[2025-02-20 11:41:58] (step=0827900) Train Loss: 0.1509, Train Steps/Sec: 11.59, Grad Norm: 0.0058 +[2025-02-20 11:42:07] (step=0828000) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 11:42:16] (step=0828100) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 11:42:26] (step=0828200) Train Loss: 0.1490, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-20 11:42:34] (step=0828300) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 11:42:43] (step=0828400) Train Loss: 0.1502, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 11:42:52] (step=0828500) Train Loss: 0.1502, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 11:43:00] (step=0828600) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 11:43:09] (step=0828700) Train Loss: 0.1501, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 11:43:17] (step=0828800) Train Loss: 0.1494, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 11:43:26] (step=0828900) Train Loss: 0.1497, Train Steps/Sec: 11.24, Grad Norm: 0.0065 +[2025-02-20 11:43:36] (step=0829000) Train Loss: 0.1503, Train Steps/Sec: 10.55, Grad Norm: 0.0060 +[2025-02-20 11:43:45] (step=0829100) Train Loss: 0.1497, Train Steps/Sec: 10.91, Grad Norm: 0.0060 +[2025-02-20 11:43:54] (step=0829200) Train Loss: 0.1508, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 11:44:02] (step=0829300) Train Loss: 0.1495, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 11:44:11] (step=0829400) Train Loss: 0.1499, Train Steps/Sec: 11.53, Grad Norm: 0.0061 +[2025-02-20 11:44:21] (step=0829500) Train Loss: 0.1501, Train Steps/Sec: 9.97, Grad Norm: 0.0058 +[2025-02-20 11:44:30] (step=0829600) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 11:44:38] (step=0829700) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 11:44:47] (step=0829800) Train Loss: 0.1500, Train Steps/Sec: 11.24, Grad Norm: 0.0064 +[2025-02-20 11:44:56] (step=0829900) Train Loss: 0.1515, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 11:45:05] (step=0830000) Train Loss: 0.1491, Train Steps/Sec: 11.24, Grad Norm: 0.0059 +[2025-02-20 11:45:13] (step=0830100) Train Loss: 0.1489, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 11:45:22] (step=0830200) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 11:45:30] (step=0830300) Train Loss: 0.1488, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 11:45:39] (step=0830400) Train Loss: 0.1510, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 11:45:48] (step=0830500) Train Loss: 0.1494, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 11:45:56] (step=0830600) Train Loss: 0.1493, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 11:46:06] (step=0830700) Train Loss: 0.1502, Train Steps/Sec: 10.04, Grad Norm: 0.0062 +[2025-02-20 11:46:15] (step=0830800) Train Loss: 0.1505, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 11:46:24] (step=0830900) Train Loss: 0.1492, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 11:46:32] (step=0831000) Train Loss: 0.1501, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 11:46:41] (step=0831100) Train Loss: 0.1493, Train Steps/Sec: 10.86, Grad Norm: 0.0061 +[2025-02-20 11:46:51] (step=0831200) Train Loss: 0.1491, Train Steps/Sec: 10.86, Grad Norm: 0.0060 +[2025-02-20 11:47:00] (step=0831300) Train Loss: 0.1481, Train Steps/Sec: 10.90, Grad Norm: 0.0060 +[2025-02-20 11:47:08] (step=0831400) Train Loss: 0.1489, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 11:47:17] (step=0831500) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0058 +[2025-02-20 11:47:26] (step=0831600) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0065 +[2025-02-20 11:47:34] (step=0831700) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 11:47:43] (step=0831800) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 11:47:52] (step=0831900) Train Loss: 0.1503, Train Steps/Sec: 11.56, Grad Norm: 0.0058 +[2025-02-20 11:48:02] (step=0832000) Train Loss: 0.1499, Train Steps/Sec: 9.71, Grad Norm: 0.0061 +[2025-02-20 11:48:11] (step=0832100) Train Loss: 0.1496, Train Steps/Sec: 11.23, Grad Norm: 0.0059 +[2025-02-20 11:48:19] (step=0832200) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 11:48:28] (step=0832300) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 11:48:37] (step=0832400) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 11:48:45] (step=0832500) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 11:48:54] (step=0832600) Train Loss: 0.1515, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 11:49:03] (step=0832700) Train Loss: 0.1488, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 11:49:11] (step=0832800) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 11:49:20] (step=0832900) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 11:49:28] (step=0833000) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 11:49:37] (step=0833100) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 11:49:47] (step=0833200) Train Loss: 0.1508, Train Steps/Sec: 9.87, Grad Norm: 0.0061 +[2025-02-20 11:49:57] (step=0833300) Train Loss: 0.1494, Train Steps/Sec: 10.25, Grad Norm: 0.0060 +[2025-02-20 11:50:06] (step=0833400) Train Loss: 0.1514, Train Steps/Sec: 10.97, Grad Norm: 0.0061 +[2025-02-20 11:50:15] (step=0833500) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 11:50:23] (step=0833600) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 11:50:32] (step=0833700) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 11:50:40] (step=0833800) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 11:50:49] (step=0833900) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 11:50:58] (step=0834000) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 11:51:06] (step=0834100) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 11:51:15] (step=0834200) Train Loss: 0.1501, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-20 11:51:24] (step=0834300) Train Loss: 0.1505, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-20 11:51:33] (step=0834400) Train Loss: 0.1488, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 11:51:43] (step=0834500) Train Loss: 0.1516, Train Steps/Sec: 9.98, Grad Norm: 0.0061 +[2025-02-20 11:51:51] (step=0834600) Train Loss: 0.1485, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 11:52:00] (step=0834700) Train Loss: 0.1504, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 11:52:09] (step=0834800) Train Loss: 0.1491, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 11:52:17] (step=0834900) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 11:52:26] (step=0835000) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 11:52:34] (step=0835100) Train Loss: 0.1488, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 11:52:43] (step=0835200) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 11:52:52] (step=0835300) Train Loss: 0.1491, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 11:53:00] (step=0835400) Train Loss: 0.1512, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 11:53:10] (step=0835500) Train Loss: 0.1493, Train Steps/Sec: 10.27, Grad Norm: 0.0060 +[2025-02-20 11:53:19] (step=0835600) Train Loss: 0.1499, Train Steps/Sec: 10.93, Grad Norm: 0.0063 +[2025-02-20 11:53:29] (step=0835700) Train Loss: 0.1482, Train Steps/Sec: 9.96, Grad Norm: 0.0061 +[2025-02-20 11:53:38] (step=0835800) Train Loss: 0.1512, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 11:53:46] (step=0835900) Train Loss: 0.1507, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 11:53:55] (step=0836000) Train Loss: 0.1490, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 11:54:04] (step=0836100) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 11:54:12] (step=0836200) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 11:54:21] (step=0836300) Train Loss: 0.1493, Train Steps/Sec: 11.22, Grad Norm: 0.0061 +[2025-02-20 11:54:30] (step=0836400) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 11:54:39] (step=0836500) Train Loss: 0.1495, Train Steps/Sec: 11.23, Grad Norm: 0.0065 +[2025-02-20 11:54:47] (step=0836600) Train Loss: 0.1512, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 11:54:56] (step=0836700) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 11:55:05] (step=0836800) Train Loss: 0.1505, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 11:55:13] (step=0836900) Train Loss: 0.1506, Train Steps/Sec: 11.54, Grad Norm: 0.0059 +[2025-02-20 11:55:23] (step=0837000) Train Loss: 0.1509, Train Steps/Sec: 9.97, Grad Norm: 0.0063 +[2025-02-20 11:55:32] (step=0837100) Train Loss: 0.1501, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-20 11:55:41] (step=0837200) Train Loss: 0.1505, Train Steps/Sec: 11.67, Grad Norm: 0.0065 +[2025-02-20 11:55:49] (step=0837300) Train Loss: 0.1511, Train Steps/Sec: 11.67, Grad Norm: 0.0059 +[2025-02-20 11:55:58] (step=0837400) Train Loss: 0.1495, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-20 11:56:06] (step=0837500) Train Loss: 0.1490, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 11:56:15] (step=0837600) Train Loss: 0.1501, Train Steps/Sec: 11.27, Grad Norm: 0.0062 +[2025-02-20 11:56:25] (step=0837700) Train Loss: 0.1498, Train Steps/Sec: 10.57, Grad Norm: 0.0060 +[2025-02-20 11:56:34] (step=0837800) Train Loss: 0.1496, Train Steps/Sec: 10.96, Grad Norm: 0.0062 +[2025-02-20 11:56:42] (step=0837900) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 11:56:51] (step=0838000) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 11:57:00] (step=0838100) Train Loss: 0.1481, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 11:57:10] (step=0838200) Train Loss: 0.1490, Train Steps/Sec: 9.94, Grad Norm: 0.0060 +[2025-02-20 11:57:18] (step=0838300) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 11:57:27] (step=0838400) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 11:57:36] (step=0838500) Train Loss: 0.1512, Train Steps/Sec: 11.22, Grad Norm: 0.0060 +[2025-02-20 11:57:45] (step=0838600) Train Loss: 0.1499, Train Steps/Sec: 11.22, Grad Norm: 0.0059 +[2025-02-20 11:57:53] (step=0838700) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 11:58:02] (step=0838800) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 11:58:10] (step=0838900) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 11:58:19] (step=0839000) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 11:58:28] (step=0839100) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 11:58:36] (step=0839200) Train Loss: 0.1495, Train Steps/Sec: 11.58, Grad Norm: 0.0058 +[2025-02-20 11:58:45] (step=0839300) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 11:58:54] (step=0839400) Train Loss: 0.1487, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 11:59:04] (step=0839500) Train Loss: 0.1492, Train Steps/Sec: 9.97, Grad Norm: 0.0059 +[2025-02-20 11:59:12] (step=0839600) Train Loss: 0.1504, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 11:59:21] (step=0839700) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 11:59:30] (step=0839800) Train Loss: 0.1503, Train Steps/Sec: 10.55, Grad Norm: 0.0060 +[2025-02-20 11:59:40] (step=0839900) Train Loss: 0.1497, Train Steps/Sec: 10.91, Grad Norm: 0.0061 +[2025-02-20 11:59:48] (step=0840000) Train Loss: 0.1494, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 11:59:49] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0840000.pt +[2025-02-20 11:59:58] (step=0840100) Train Loss: 0.1498, Train Steps/Sec: 10.36, Grad Norm: 0.0060 +[2025-02-20 12:00:07] (step=0840200) Train Loss: 0.1489, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:00:15] (step=0840300) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:00:24] (step=0840400) Train Loss: 0.1491, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 12:00:33] (step=0840500) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:00:41] (step=0840600) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:00:52] (step=0840700) Train Loss: 0.1493, Train Steps/Sec: 9.42, Grad Norm: 0.0060 +[2025-02-20 12:01:00] (step=0840800) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 12:01:09] (step=0840900) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 12:01:18] (step=0841000) Train Loss: 0.1506, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 12:01:26] (step=0841100) Train Loss: 0.1483, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 12:01:35] (step=0841200) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 12:01:44] (step=0841300) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 12:01:52] (step=0841400) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:02:01] (step=0841500) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:02:09] (step=0841600) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:02:18] (step=0841700) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:02:27] (step=0841800) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:02:35] (step=0841900) Train Loss: 0.1490, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 12:02:46] (step=0842000) Train Loss: 0.1508, Train Steps/Sec: 8.94, Grad Norm: 0.0064 +[2025-02-20 12:02:56] (step=0842100) Train Loss: 0.1508, Train Steps/Sec: 10.93, Grad Norm: 0.0062 +[2025-02-20 12:03:04] (step=0842200) Train Loss: 0.1480, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 12:03:13] (step=0842300) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:03:21] (step=0842400) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 12:03:30] (step=0842500) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:03:39] (step=0842600) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:03:47] (step=0842700) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:03:56] (step=0842800) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:04:05] (step=0842900) Train Loss: 0.1495, Train Steps/Sec: 10.86, Grad Norm: 0.0061 +[2025-02-20 12:04:14] (step=0843000) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:04:22] (step=0843100) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 12:04:32] (step=0843200) Train Loss: 0.1503, Train Steps/Sec: 9.99, Grad Norm: 0.0061 +[2025-02-20 12:04:41] (step=0843300) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 12:04:50] (step=0843400) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 12:04:58] (step=0843500) Train Loss: 0.1501, Train Steps/Sec: 11.47, Grad Norm: 0.0063 +[2025-02-20 12:05:07] (step=0843600) Train Loss: 0.1495, Train Steps/Sec: 11.41, Grad Norm: 0.0059 +[2025-02-20 12:05:16] (step=0843700) Train Loss: 0.1493, Train Steps/Sec: 11.42, Grad Norm: 0.0061 +[2025-02-20 12:05:25] (step=0843800) Train Loss: 0.1488, Train Steps/Sec: 11.42, Grad Norm: 0.0061 +[2025-02-20 12:05:33] (step=0843900) Train Loss: 0.1496, Train Steps/Sec: 11.42, Grad Norm: 0.0061 +[2025-02-20 12:05:42] (step=0844000) Train Loss: 0.1505, Train Steps/Sec: 11.42, Grad Norm: 0.0061 +[2025-02-20 12:05:51] (step=0844100) Train Loss: 0.1490, Train Steps/Sec: 11.09, Grad Norm: 0.0061 +[2025-02-20 12:06:01] (step=0844200) Train Loss: 0.1492, Train Steps/Sec: 10.34, Grad Norm: 0.0060 +[2025-02-20 12:06:10] (step=0844300) Train Loss: 0.1502, Train Steps/Sec: 10.78, Grad Norm: 0.0059 +[2025-02-20 12:06:19] (step=0844400) Train Loss: 0.1490, Train Steps/Sec: 11.37, Grad Norm: 0.0058 +[2025-02-20 12:06:29] (step=0844500) Train Loss: 0.1486, Train Steps/Sec: 9.99, Grad Norm: 0.0062 +[2025-02-20 12:06:37] (step=0844600) Train Loss: 0.1481, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:06:46] (step=0844700) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:06:55] (step=0844800) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 12:07:03] (step=0844900) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:07:12] (step=0845000) Train Loss: 0.1499, Train Steps/Sec: 11.26, Grad Norm: 0.0059 +[2025-02-20 12:07:21] (step=0845100) Train Loss: 0.1511, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-20 12:07:30] (step=0845200) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:07:38] (step=0845300) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:07:47] (step=0845400) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 12:07:56] (step=0845500) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 12:08:04] (step=0845600) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:08:14] (step=0845700) Train Loss: 0.1510, Train Steps/Sec: 9.99, Grad Norm: 0.0061 +[2025-02-20 12:08:23] (step=0845800) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:08:31] (step=0845900) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:08:40] (step=0846000) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 12:08:49] (step=0846100) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:08:57] (step=0846200) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:09:06] (step=0846300) Train Loss: 0.1501, Train Steps/Sec: 10.90, Grad Norm: 0.0061 +[2025-02-20 12:09:16] (step=0846400) Train Loss: 0.1506, Train Steps/Sec: 10.87, Grad Norm: 0.0059 +[2025-02-20 12:09:25] (step=0846500) Train Loss: 0.1507, Train Steps/Sec: 10.94, Grad Norm: 0.0058 +[2025-02-20 12:09:33] (step=0846600) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:09:42] (step=0846700) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:09:51] (step=0846800) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 12:09:59] (step=0846900) Train Loss: 0.1508, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 12:10:09] (step=0847000) Train Loss: 0.1482, Train Steps/Sec: 10.02, Grad Norm: 0.0059 +[2025-02-20 12:10:18] (step=0847100) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 12:10:27] (step=0847200) Train Loss: 0.1517, Train Steps/Sec: 11.28, Grad Norm: 0.0064 +[2025-02-20 12:10:36] (step=0847300) Train Loss: 0.1500, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 12:10:44] (step=0847400) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 12:10:53] (step=0847500) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 12:11:01] (step=0847600) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:11:10] (step=0847700) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 12:11:19] (step=0847800) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 12:11:27] (step=0847900) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:11:36] (step=0848000) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:11:44] (step=0848100) Train Loss: 0.1485, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:11:54] (step=0848200) Train Loss: 0.1511, Train Steps/Sec: 9.98, Grad Norm: 0.0060 +[2025-02-20 12:12:03] (step=0848300) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 12:12:12] (step=0848400) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 12:12:21] (step=0848500) Train Loss: 0.1493, Train Steps/Sec: 10.54, Grad Norm: 0.0059 +[2025-02-20 12:12:31] (step=0848600) Train Loss: 0.1500, Train Steps/Sec: 10.59, Grad Norm: 0.0061 +[2025-02-20 12:12:39] (step=0848700) Train Loss: 0.1513, Train Steps/Sec: 11.58, Grad Norm: 0.0058 +[2025-02-20 12:12:48] (step=0848800) Train Loss: 0.1493, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 12:12:57] (step=0848900) Train Loss: 0.1490, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 12:13:05] (step=0849000) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 12:13:14] (step=0849100) Train Loss: 0.1511, Train Steps/Sec: 11.59, Grad Norm: 0.0058 +[2025-02-20 12:13:22] (step=0849200) Train Loss: 0.1504, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 12:13:31] (step=0849300) Train Loss: 0.1502, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 12:13:40] (step=0849400) Train Loss: 0.1489, Train Steps/Sec: 10.82, Grad Norm: 0.0060 +[2025-02-20 12:13:50] (step=0849500) Train Loss: 0.1497, Train Steps/Sec: 10.02, Grad Norm: 0.0060 +[2025-02-20 12:13:59] (step=0849600) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 12:14:08] (step=0849700) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 12:14:16] (step=0849800) Train Loss: 0.1489, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 12:14:25] (step=0849900) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0058 +[2025-02-20 12:14:33] (step=0850000) Train Loss: 0.1485, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 12:14:42] (step=0850100) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 12:14:51] (step=0850200) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 12:14:59] (step=0850300) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 12:15:08] (step=0850400) Train Loss: 0.1486, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 12:15:16] (step=0850500) Train Loss: 0.1489, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 12:15:25] (step=0850600) Train Loss: 0.1495, Train Steps/Sec: 11.27, Grad Norm: 0.0063 +[2025-02-20 12:15:36] (step=0850700) Train Loss: 0.1499, Train Steps/Sec: 9.20, Grad Norm: 0.0060 +[2025-02-20 12:15:45] (step=0850800) Train Loss: 0.1497, Train Steps/Sec: 10.95, Grad Norm: 0.0059 +[2025-02-20 12:15:54] (step=0850900) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 12:16:02] (step=0851000) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 12:16:11] (step=0851100) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 12:16:20] (step=0851200) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:16:28] (step=0851300) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 12:16:37] (step=0851400) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 12:16:45] (step=0851500) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:16:55] (step=0851600) Train Loss: 0.1491, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-20 12:17:03] (step=0851700) Train Loss: 0.1489, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 12:17:12] (step=0851800) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 12:17:20] (step=0851900) Train Loss: 0.1499, Train Steps/Sec: 11.58, Grad Norm: 0.0058 +[2025-02-20 12:17:30] (step=0852000) Train Loss: 0.1496, Train Steps/Sec: 10.03, Grad Norm: 0.0061 +[2025-02-20 12:17:39] (step=0852100) Train Loss: 0.1512, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 12:17:48] (step=0852200) Train Loss: 0.1485, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 12:17:56] (step=0852300) Train Loss: 0.1500, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 12:18:05] (step=0852400) Train Loss: 0.1495, Train Steps/Sec: 11.57, Grad Norm: 0.0065 +[2025-02-20 12:18:14] (step=0852500) Train Loss: 0.1502, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 12:18:22] (step=0852600) Train Loss: 0.1496, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 12:18:31] (step=0852700) Train Loss: 0.1485, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 12:18:40] (step=0852800) Train Loss: 0.1492, Train Steps/Sec: 11.22, Grad Norm: 0.0063 +[2025-02-20 12:18:49] (step=0852900) Train Loss: 0.1495, Train Steps/Sec: 10.50, Grad Norm: 0.0062 +[2025-02-20 12:18:58] (step=0853000) Train Loss: 0.1502, Train Steps/Sec: 10.88, Grad Norm: 0.0062 +[2025-02-20 12:19:07] (step=0853100) Train Loss: 0.1495, Train Steps/Sec: 11.54, Grad Norm: 0.0061 +[2025-02-20 12:19:17] (step=0853200) Train Loss: 0.1497, Train Steps/Sec: 9.95, Grad Norm: 0.0059 +[2025-02-20 12:19:26] (step=0853300) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 12:19:34] (step=0853400) Train Loss: 0.1487, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:19:43] (step=0853500) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:19:52] (step=0853600) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:20:01] (step=0853700) Train Loss: 0.1501, Train Steps/Sec: 10.91, Grad Norm: 0.0062 +[2025-02-20 12:20:09] (step=0853800) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 12:20:18] (step=0853900) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 12:20:27] (step=0854000) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 12:20:35] (step=0854100) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 12:20:44] (step=0854200) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:20:53] (step=0854300) Train Loss: 0.1519, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:21:01] (step=0854400) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 12:21:11] (step=0854500) Train Loss: 0.1508, Train Steps/Sec: 9.98, Grad Norm: 0.0060 +[2025-02-20 12:21:20] (step=0854600) Train Loss: 0.1488, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 12:21:28] (step=0854700) Train Loss: 0.1480, Train Steps/Sec: 11.57, Grad Norm: 0.0058 +[2025-02-20 12:21:37] (step=0854800) Train Loss: 0.1496, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 12:21:46] (step=0854900) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:21:55] (step=0855000) Train Loss: 0.1507, Train Steps/Sec: 10.56, Grad Norm: 0.0062 +[2025-02-20 12:22:04] (step=0855100) Train Loss: 0.1499, Train Steps/Sec: 11.24, Grad Norm: 0.0059 +[2025-02-20 12:22:13] (step=0855200) Train Loss: 0.1487, Train Steps/Sec: 10.96, Grad Norm: 0.0059 +[2025-02-20 12:22:22] (step=0855300) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 12:22:30] (step=0855400) Train Loss: 0.1502, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 12:22:39] (step=0855500) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 12:22:48] (step=0855600) Train Loss: 0.1496, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 12:22:58] (step=0855700) Train Loss: 0.1497, Train Steps/Sec: 9.95, Grad Norm: 0.0062 +[2025-02-20 12:23:06] (step=0855800) Train Loss: 0.1503, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 12:23:16] (step=0855900) Train Loss: 0.1508, Train Steps/Sec: 10.88, Grad Norm: 0.0062 +[2025-02-20 12:23:24] (step=0856000) Train Loss: 0.1489, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 12:23:33] (step=0856100) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 12:23:41] (step=0856200) Train Loss: 0.1486, Train Steps/Sec: 11.60, Grad Norm: 0.0058 +[2025-02-20 12:23:50] (step=0856300) Train Loss: 0.1495, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 12:23:59] (step=0856400) Train Loss: 0.1494, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-20 12:24:07] (step=0856500) Train Loss: 0.1499, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 12:24:16] (step=0856600) Train Loss: 0.1504, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 12:24:25] (step=0856700) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 12:24:33] (step=0856800) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0057 +[2025-02-20 12:24:42] (step=0856900) Train Loss: 0.1514, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 12:24:52] (step=0857000) Train Loss: 0.1486, Train Steps/Sec: 9.99, Grad Norm: 0.0062 +[2025-02-20 12:25:01] (step=0857100) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 12:25:10] (step=0857200) Train Loss: 0.1508, Train Steps/Sec: 10.46, Grad Norm: 0.0062 +[2025-02-20 12:25:20] (step=0857300) Train Loss: 0.1500, Train Steps/Sec: 10.60, Grad Norm: 0.0061 +[2025-02-20 12:25:28] (step=0857400) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 12:25:37] (step=0857500) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:25:45] (step=0857600) Train Loss: 0.1488, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 12:25:54] (step=0857700) Train Loss: 0.1514, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:26:03] (step=0857800) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 12:26:11] (step=0857900) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:26:20] (step=0858000) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 12:26:29] (step=0858100) Train Loss: 0.1501, Train Steps/Sec: 10.90, Grad Norm: 0.0060 +[2025-02-20 12:26:39] (step=0858200) Train Loss: 0.1503, Train Steps/Sec: 10.01, Grad Norm: 0.0060 +[2025-02-20 12:26:48] (step=0858300) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:26:56] (step=0858400) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 12:27:05] (step=0858500) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 12:27:13] (step=0858600) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 12:27:22] (step=0858700) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 12:27:31] (step=0858800) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:27:39] (step=0858900) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 12:27:48] (step=0859000) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 12:27:56] (step=0859100) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:28:05] (step=0859200) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 12:28:14] (step=0859300) Train Loss: 0.1507, Train Steps/Sec: 11.28, Grad Norm: 0.0060 +[2025-02-20 12:28:23] (step=0859400) Train Loss: 0.1494, Train Steps/Sec: 10.55, Grad Norm: 0.0060 +[2025-02-20 12:28:34] (step=0859500) Train Loss: 0.1500, Train Steps/Sec: 9.55, Grad Norm: 0.0063 +[2025-02-20 12:28:42] (step=0859600) Train Loss: 0.1513, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:28:51] (step=0859700) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 12:29:00] (step=0859800) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 12:29:08] (step=0859900) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 12:29:17] (step=0860000) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 12:29:18] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0860000.pt +[2025-02-20 12:29:27] (step=0860100) Train Loss: 0.1500, Train Steps/Sec: 10.34, Grad Norm: 0.0061 +[2025-02-20 12:29:35] (step=0860200) Train Loss: 0.1500, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 12:29:44] (step=0860300) Train Loss: 0.1497, Train Steps/Sec: 11.29, Grad Norm: 0.0063 +[2025-02-20 12:29:53] (step=0860400) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:30:02] (step=0860500) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:30:10] (step=0860600) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 12:30:20] (step=0860700) Train Loss: 0.1491, Train Steps/Sec: 9.95, Grad Norm: 0.0059 +[2025-02-20 12:30:29] (step=0860800) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 12:30:37] (step=0860900) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 12:30:46] (step=0861000) Train Loss: 0.1494, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 12:30:55] (step=0861100) Train Loss: 0.1498, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 12:31:03] (step=0861200) Train Loss: 0.1500, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 12:31:12] (step=0861300) Train Loss: 0.1493, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 12:31:21] (step=0861400) Train Loss: 0.1512, Train Steps/Sec: 11.57, Grad Norm: 0.0058 +[2025-02-20 12:31:30] (step=0861500) Train Loss: 0.1494, Train Steps/Sec: 11.23, Grad Norm: 0.0059 +[2025-02-20 12:31:39] (step=0861600) Train Loss: 0.1489, Train Steps/Sec: 10.52, Grad Norm: 0.0062 +[2025-02-20 12:31:48] (step=0861700) Train Loss: 0.1498, Train Steps/Sec: 10.89, Grad Norm: 0.0059 +[2025-02-20 12:31:57] (step=0861800) Train Loss: 0.1503, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 12:32:05] (step=0861900) Train Loss: 0.1498, Train Steps/Sec: 11.57, Grad Norm: 0.0058 +[2025-02-20 12:32:15] (step=0862000) Train Loss: 0.1517, Train Steps/Sec: 10.02, Grad Norm: 0.0060 +[2025-02-20 12:32:24] (step=0862100) Train Loss: 0.1492, Train Steps/Sec: 11.57, Grad Norm: 0.0059 +[2025-02-20 12:32:33] (step=0862200) Train Loss: 0.1502, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 12:32:41] (step=0862300) Train Loss: 0.1484, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 12:32:51] (step=0862400) Train Loss: 0.1488, Train Steps/Sec: 10.90, Grad Norm: 0.0060 +[2025-02-20 12:32:59] (step=0862500) Train Loss: 0.1488, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-20 12:33:08] (step=0862600) Train Loss: 0.1494, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 12:33:16] (step=0862700) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:33:25] (step=0862800) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 12:33:34] (step=0862900) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 12:33:42] (step=0863000) Train Loss: 0.1483, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:33:51] (step=0863100) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 12:34:01] (step=0863200) Train Loss: 0.1500, Train Steps/Sec: 9.99, Grad Norm: 0.0059 +[2025-02-20 12:34:09] (step=0863300) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 12:34:18] (step=0863400) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 12:34:27] (step=0863500) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 12:34:35] (step=0863600) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 12:34:45] (step=0863700) Train Loss: 0.1511, Train Steps/Sec: 10.58, Grad Norm: 0.0060 +[2025-02-20 12:34:54] (step=0863800) Train Loss: 0.1494, Train Steps/Sec: 10.93, Grad Norm: 0.0059 +[2025-02-20 12:35:03] (step=0863900) Train Loss: 0.1510, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 12:35:11] (step=0864000) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 12:35:20] (step=0864100) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 12:35:29] (step=0864200) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 12:35:37] (step=0864300) Train Loss: 0.1487, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 12:35:46] (step=0864400) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 12:35:56] (step=0864500) Train Loss: 0.1495, Train Steps/Sec: 9.93, Grad Norm: 0.0061 +[2025-02-20 12:36:05] (step=0864600) Train Loss: 0.1495, Train Steps/Sec: 10.89, Grad Norm: 0.0059 +[2025-02-20 12:36:14] (step=0864700) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 12:36:22] (step=0864800) Train Loss: 0.1487, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:36:31] (step=0864900) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:36:40] (step=0865000) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:36:48] (step=0865100) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 12:36:57] (step=0865200) Train Loss: 0.1502, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 12:37:05] (step=0865300) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:37:14] (step=0865400) Train Loss: 0.1490, Train Steps/Sec: 11.60, Grad Norm: 0.0058 +[2025-02-20 12:37:23] (step=0865500) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:37:31] (step=0865600) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:37:41] (step=0865700) Train Loss: 0.1500, Train Steps/Sec: 9.99, Grad Norm: 0.0063 +[2025-02-20 12:37:50] (step=0865800) Train Loss: 0.1498, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 12:37:59] (step=0865900) Train Loss: 0.1487, Train Steps/Sec: 10.55, Grad Norm: 0.0060 +[2025-02-20 12:38:09] (step=0866000) Train Loss: 0.1500, Train Steps/Sec: 10.58, Grad Norm: 0.0058 +[2025-02-20 12:38:17] (step=0866100) Train Loss: 0.1494, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 12:38:26] (step=0866200) Train Loss: 0.1484, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 12:38:35] (step=0866300) Train Loss: 0.1503, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 12:38:43] (step=0866400) Train Loss: 0.1504, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 12:38:52] (step=0866500) Train Loss: 0.1496, Train Steps/Sec: 11.58, Grad Norm: 0.0058 +[2025-02-20 12:39:01] (step=0866600) Train Loss: 0.1509, Train Steps/Sec: 11.57, Grad Norm: 0.0062 +[2025-02-20 12:39:10] (step=0866700) Train Loss: 0.1493, Train Steps/Sec: 11.20, Grad Norm: 0.0060 +[2025-02-20 12:39:18] (step=0866800) Train Loss: 0.1495, Train Steps/Sec: 11.22, Grad Norm: 0.0059 +[2025-02-20 12:39:27] (step=0866900) Train Loss: 0.1505, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 12:39:37] (step=0867000) Train Loss: 0.1499, Train Steps/Sec: 10.00, Grad Norm: 0.0059 +[2025-02-20 12:39:46] (step=0867100) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:39:54] (step=0867200) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:40:03] (step=0867300) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:40:12] (step=0867400) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:40:20] (step=0867500) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 12:40:29] (step=0867600) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 12:40:37] (step=0867700) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:40:46] (step=0867800) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:40:55] (step=0867900) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:41:04] (step=0868000) Train Loss: 0.1489, Train Steps/Sec: 11.25, Grad Norm: 0.0062 +[2025-02-20 12:41:13] (step=0868100) Train Loss: 0.1498, Train Steps/Sec: 10.53, Grad Norm: 0.0062 +[2025-02-20 12:41:24] (step=0868200) Train Loss: 0.1498, Train Steps/Sec: 9.51, Grad Norm: 0.0065 +[2025-02-20 12:41:32] (step=0868300) Train Loss: 0.1486, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:41:41] (step=0868400) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:41:49] (step=0868500) Train Loss: 0.1483, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:41:58] (step=0868600) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:42:07] (step=0868700) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:42:15] (step=0868800) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:42:24] (step=0868900) Train Loss: 0.1504, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-20 12:42:33] (step=0869000) Train Loss: 0.1488, Train Steps/Sec: 11.28, Grad Norm: 0.0060 +[2025-02-20 12:42:42] (step=0869100) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 12:42:50] (step=0869200) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 12:42:59] (step=0869300) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 12:43:07] (step=0869400) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:43:17] (step=0869500) Train Loss: 0.1495, Train Steps/Sec: 9.96, Grad Norm: 0.0061 +[2025-02-20 12:43:26] (step=0869600) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 12:43:35] (step=0869700) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:43:43] (step=0869800) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:43:52] (step=0869900) Train Loss: 0.1492, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 12:44:01] (step=0870000) Train Loss: 0.1479, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 12:44:09] (step=0870100) Train Loss: 0.1491, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 12:44:19] (step=0870200) Train Loss: 0.1503, Train Steps/Sec: 10.55, Grad Norm: 0.0059 +[2025-02-20 12:44:28] (step=0870300) Train Loss: 0.1496, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-20 12:44:37] (step=0870400) Train Loss: 0.1482, Train Steps/Sec: 10.94, Grad Norm: 0.0062 +[2025-02-20 12:44:45] (step=0870500) Train Loss: 0.1490, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 12:44:54] (step=0870600) Train Loss: 0.1485, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 12:45:04] (step=0870700) Train Loss: 0.1493, Train Steps/Sec: 10.02, Grad Norm: 0.0063 +[2025-02-20 12:45:13] (step=0870800) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 12:45:21] (step=0870900) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 12:45:30] (step=0871000) Train Loss: 0.1488, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 12:45:39] (step=0871100) Train Loss: 0.1493, Train Steps/Sec: 10.90, Grad Norm: 0.0062 +[2025-02-20 12:45:48] (step=0871200) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 12:45:56] (step=0871300) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:46:05] (step=0871400) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 12:46:13] (step=0871500) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 12:46:22] (step=0871600) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 12:46:31] (step=0871700) Train Loss: 0.1494, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 12:46:39] (step=0871800) Train Loss: 0.1488, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:46:48] (step=0871900) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:46:58] (step=0872000) Train Loss: 0.1490, Train Steps/Sec: 10.06, Grad Norm: 0.0061 +[2025-02-20 12:47:06] (step=0872100) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:47:15] (step=0872200) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 12:47:24] (step=0872300) Train Loss: 0.1488, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 12:47:33] (step=0872400) Train Loss: 0.1489, Train Steps/Sec: 10.56, Grad Norm: 0.0062 +[2025-02-20 12:47:42] (step=0872500) Train Loss: 0.1508, Train Steps/Sec: 10.90, Grad Norm: 0.0058 +[2025-02-20 12:47:51] (step=0872600) Train Loss: 0.1505, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-20 12:48:00] (step=0872700) Train Loss: 0.1484, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 12:48:08] (step=0872800) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:48:17] (step=0872900) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 12:48:26] (step=0873000) Train Loss: 0.1515, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:48:34] (step=0873100) Train Loss: 0.1512, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 12:48:44] (step=0873200) Train Loss: 0.1492, Train Steps/Sec: 9.93, Grad Norm: 0.0061 +[2025-02-20 12:48:54] (step=0873300) Train Loss: 0.1488, Train Steps/Sec: 10.88, Grad Norm: 0.0063 +[2025-02-20 12:49:02] (step=0873400) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:49:11] (step=0873500) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:49:19] (step=0873600) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 12:49:28] (step=0873700) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:49:37] (step=0873800) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:49:45] (step=0873900) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:49:54] (step=0874000) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:50:02] (step=0874100) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:50:11] (step=0874200) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 12:50:20] (step=0874300) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:50:28] (step=0874400) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:50:38] (step=0874500) Train Loss: 0.1496, Train Steps/Sec: 9.99, Grad Norm: 0.0061 +[2025-02-20 12:50:48] (step=0874600) Train Loss: 0.1498, Train Steps/Sec: 10.23, Grad Norm: 0.0058 +[2025-02-20 12:50:57] (step=0874700) Train Loss: 0.1500, Train Steps/Sec: 10.94, Grad Norm: 0.0058 +[2025-02-20 12:51:06] (step=0874800) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:51:14] (step=0874900) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 12:51:23] (step=0875000) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 12:51:32] (step=0875100) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:51:40] (step=0875200) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:51:49] (step=0875300) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 12:51:58] (step=0875400) Train Loss: 0.1506, Train Steps/Sec: 11.22, Grad Norm: 0.0063 +[2025-02-20 12:52:07] (step=0875500) Train Loss: 0.1505, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 12:52:15] (step=0875600) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:52:25] (step=0875700) Train Loss: 0.1497, Train Steps/Sec: 10.82, Grad Norm: 0.0064 +[2025-02-20 12:52:34] (step=0875800) Train Loss: 0.1494, Train Steps/Sec: 10.60, Grad Norm: 0.0062 +[2025-02-20 12:52:43] (step=0875900) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:52:51] (step=0876000) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 12:53:00] (step=0876100) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 12:53:08] (step=0876200) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 12:53:17] (step=0876300) Train Loss: 0.1486, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 12:53:26] (step=0876400) Train Loss: 0.1515, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:53:34] (step=0876500) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:53:43] (step=0876600) Train Loss: 0.1483, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 12:53:52] (step=0876700) Train Loss: 0.1491, Train Steps/Sec: 10.90, Grad Norm: 0.0060 +[2025-02-20 12:54:01] (step=0876800) Train Loss: 0.1499, Train Steps/Sec: 10.87, Grad Norm: 0.0060 +[2025-02-20 12:54:10] (step=0876900) Train Loss: 0.1489, Train Steps/Sec: 10.95, Grad Norm: 0.0059 +[2025-02-20 12:54:20] (step=0877000) Train Loss: 0.1516, Train Steps/Sec: 10.01, Grad Norm: 0.0062 +[2025-02-20 12:54:29] (step=0877100) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:54:38] (step=0877200) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:54:46] (step=0877300) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:54:55] (step=0877400) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:55:03] (step=0877500) Train Loss: 0.1482, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:55:12] (step=0877600) Train Loss: 0.1509, Train Steps/Sec: 11.22, Grad Norm: 0.0060 +[2025-02-20 12:55:21] (step=0877700) Train Loss: 0.1505, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-20 12:55:30] (step=0877800) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 12:55:39] (step=0877900) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 12:55:47] (step=0878000) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:55:56] (step=0878100) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 12:56:04] (step=0878200) Train Loss: 0.1489, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 12:56:14] (step=0878300) Train Loss: 0.1501, Train Steps/Sec: 10.01, Grad Norm: 0.0061 +[2025-02-20 12:56:23] (step=0878400) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:56:32] (step=0878500) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:56:40] (step=0878600) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 12:56:49] (step=0878700) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:56:57] (step=0878800) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 12:57:07] (step=0878900) Train Loss: 0.1497, Train Steps/Sec: 10.57, Grad Norm: 0.0059 +[2025-02-20 12:57:16] (step=0879000) Train Loss: 0.1499, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-20 12:57:25] (step=0879100) Train Loss: 0.1490, Train Steps/Sec: 10.93, Grad Norm: 0.0061 +[2025-02-20 12:57:34] (step=0879200) Train Loss: 0.1491, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 12:57:42] (step=0879300) Train Loss: 0.1501, Train Steps/Sec: 11.59, Grad Norm: 0.0062 +[2025-02-20 12:57:51] (step=0879400) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0058 +[2025-02-20 12:58:01] (step=0879500) Train Loss: 0.1501, Train Steps/Sec: 9.99, Grad Norm: 0.0067 +[2025-02-20 12:58:09] (step=0879600) Train Loss: 0.1484, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 12:58:18] (step=0879700) Train Loss: 0.1516, Train Steps/Sec: 11.22, Grad Norm: 0.0060 +[2025-02-20 12:58:27] (step=0879800) Train Loss: 0.1508, Train Steps/Sec: 11.25, Grad Norm: 0.0059 +[2025-02-20 12:58:36] (step=0879900) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:58:45] (step=0880000) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 12:58:45] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0880000.pt +[2025-02-20 12:58:54] (step=0880100) Train Loss: 0.1492, Train Steps/Sec: 10.59, Grad Norm: 0.0062 +[2025-02-20 12:59:03] (step=0880200) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:59:11] (step=0880300) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:59:20] (step=0880400) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 12:59:28] (step=0880500) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 12:59:37] (step=0880600) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 12:59:46] (step=0880700) Train Loss: 0.1493, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 12:59:56] (step=0880800) Train Loss: 0.1502, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-20 13:00:04] (step=0880900) Train Loss: 0.1487, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 13:00:13] (step=0881000) Train Loss: 0.1484, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 13:00:22] (step=0881100) Train Loss: 0.1495, Train Steps/Sec: 10.60, Grad Norm: 0.0059 +[2025-02-20 13:00:31] (step=0881200) Train Loss: 0.1503, Train Steps/Sec: 10.92, Grad Norm: 0.0060 +[2025-02-20 13:00:40] (step=0881300) Train Loss: 0.1496, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 13:00:49] (step=0881400) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 13:00:58] (step=0881500) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 13:01:06] (step=0881600) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 13:01:15] (step=0881700) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 13:01:23] (step=0881800) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 13:01:32] (step=0881900) Train Loss: 0.1498, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 13:01:43] (step=0882000) Train Loss: 0.1508, Train Steps/Sec: 9.75, Grad Norm: 0.0061 +[2025-02-20 13:01:51] (step=0882100) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 13:02:00] (step=0882200) Train Loss: 0.1477, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 13:02:08] (step=0882300) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 13:02:17] (step=0882400) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 13:02:26] (step=0882500) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 13:02:34] (step=0882600) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 13:02:43] (step=0882700) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 13:02:51] (step=0882800) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 13:03:00] (step=0882900) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 13:03:09] (step=0883000) Train Loss: 0.1493, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 13:03:17] (step=0883100) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 13:03:26] (step=0883200) Train Loss: 0.1486, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 13:03:37] (step=0883300) Train Loss: 0.1488, Train Steps/Sec: 8.95, Grad Norm: 0.0061 +[2025-02-20 13:03:46] (step=0883400) Train Loss: 0.1507, Train Steps/Sec: 10.95, Grad Norm: 0.0064 +[2025-02-20 13:03:55] (step=0883500) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:04:03] (step=0883600) Train Loss: 0.1508, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 13:04:12] (step=0883700) Train Loss: 0.1488, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 13:04:21] (step=0883800) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 13:04:29] (step=0883900) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 13:04:38] (step=0884000) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 13:04:47] (step=0884100) Train Loss: 0.1519, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 13:04:56] (step=0884200) Train Loss: 0.1495, Train Steps/Sec: 11.29, Grad Norm: 0.0064 +[2025-02-20 13:05:04] (step=0884300) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:05:13] (step=0884400) Train Loss: 0.1492, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:05:23] (step=0884500) Train Loss: 0.1494, Train Steps/Sec: 9.96, Grad Norm: 0.0058 +[2025-02-20 13:05:31] (step=0884600) Train Loss: 0.1499, Train Steps/Sec: 11.67, Grad Norm: 0.0059 +[2025-02-20 13:05:40] (step=0884700) Train Loss: 0.1493, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-20 13:05:49] (step=0884800) Train Loss: 0.1484, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 13:05:57] (step=0884900) Train Loss: 0.1505, Train Steps/Sec: 11.66, Grad Norm: 0.0063 +[2025-02-20 13:06:06] (step=0885000) Train Loss: 0.1510, Train Steps/Sec: 11.66, Grad Norm: 0.0063 +[2025-02-20 13:06:14] (step=0885100) Train Loss: 0.1486, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-20 13:06:23] (step=0885200) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 13:06:32] (step=0885300) Train Loss: 0.1506, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-20 13:06:40] (step=0885400) Train Loss: 0.1503, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 13:06:50] (step=0885500) Train Loss: 0.1498, Train Steps/Sec: 10.91, Grad Norm: 0.0060 +[2025-02-20 13:06:59] (step=0885600) Train Loss: 0.1500, Train Steps/Sec: 10.97, Grad Norm: 0.0060 +[2025-02-20 13:07:07] (step=0885700) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 13:07:17] (step=0885800) Train Loss: 0.1508, Train Steps/Sec: 10.06, Grad Norm: 0.0059 +[2025-02-20 13:07:26] (step=0885900) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:07:34] (step=0886000) Train Loss: 0.1488, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 13:07:43] (step=0886100) Train Loss: 0.1519, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:07:52] (step=0886200) Train Loss: 0.1492, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 13:08:00] (step=0886300) Train Loss: 0.1479, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:08:09] (step=0886400) Train Loss: 0.1503, Train Steps/Sec: 11.29, Grad Norm: 0.0059 +[2025-02-20 13:08:18] (step=0886500) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:08:27] (step=0886600) Train Loss: 0.1488, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:08:35] (step=0886700) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 13:08:44] (step=0886800) Train Loss: 0.1490, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 13:08:52] (step=0886900) Train Loss: 0.1496, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 13:09:02] (step=0887000) Train Loss: 0.1491, Train Steps/Sec: 10.00, Grad Norm: 0.0060 +[2025-02-20 13:09:11] (step=0887100) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:09:19] (step=0887200) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:09:28] (step=0887300) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 13:09:37] (step=0887400) Train Loss: 0.1484, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 13:09:45] (step=0887500) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 13:09:55] (step=0887600) Train Loss: 0.1495, Train Steps/Sec: 10.59, Grad Norm: 0.0060 +[2025-02-20 13:10:04] (step=0887700) Train Loss: 0.1504, Train Steps/Sec: 11.23, Grad Norm: 0.0063 +[2025-02-20 13:10:13] (step=0887800) Train Loss: 0.1491, Train Steps/Sec: 10.96, Grad Norm: 0.0061 +[2025-02-20 13:10:21] (step=0887900) Train Loss: 0.1490, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 13:10:30] (step=0888000) Train Loss: 0.1509, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 13:10:39] (step=0888100) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 13:10:47] (step=0888200) Train Loss: 0.1483, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 13:10:57] (step=0888300) Train Loss: 0.1513, Train Steps/Sec: 9.97, Grad Norm: 0.0061 +[2025-02-20 13:11:06] (step=0888400) Train Loss: 0.1497, Train Steps/Sec: 11.24, Grad Norm: 0.0063 +[2025-02-20 13:11:15] (step=0888500) Train Loss: 0.1496, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-20 13:11:24] (step=0888600) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 13:11:32] (step=0888700) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 13:11:41] (step=0888800) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 13:11:49] (step=0888900) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 13:11:58] (step=0889000) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 13:12:07] (step=0889100) Train Loss: 0.1489, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 13:12:15] (step=0889200) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 13:12:24] (step=0889300) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 13:12:32] (step=0889400) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 13:12:42] (step=0889500) Train Loss: 0.1495, Train Steps/Sec: 10.02, Grad Norm: 0.0060 +[2025-02-20 13:12:51] (step=0889600) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-20 13:13:00] (step=0889700) Train Loss: 0.1488, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:13:09] (step=0889800) Train Loss: 0.1500, Train Steps/Sec: 10.56, Grad Norm: 0.0063 +[2025-02-20 13:13:18] (step=0889900) Train Loss: 0.1491, Train Steps/Sec: 11.30, Grad Norm: 0.0059 +[2025-02-20 13:13:27] (step=0890000) Train Loss: 0.1492, Train Steps/Sec: 11.30, Grad Norm: 0.0060 +[2025-02-20 13:13:35] (step=0890100) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 13:13:44] (step=0890200) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:13:53] (step=0890300) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 13:14:01] (step=0890400) Train Loss: 0.1500, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 13:14:10] (step=0890500) Train Loss: 0.1512, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 13:14:19] (step=0890600) Train Loss: 0.1507, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 13:14:28] (step=0890700) Train Loss: 0.1503, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 13:14:37] (step=0890800) Train Loss: 0.1511, Train Steps/Sec: 10.09, Grad Norm: 0.0061 +[2025-02-20 13:14:46] (step=0890900) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 13:14:55] (step=0891000) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:15:03] (step=0891100) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 13:15:12] (step=0891200) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 13:15:20] (step=0891300) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 13:15:29] (step=0891400) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 13:15:38] (step=0891500) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 13:15:46] (step=0891600) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 13:15:55] (step=0891700) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 13:16:03] (step=0891800) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 13:16:12] (step=0891900) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:16:23] (step=0892000) Train Loss: 0.1510, Train Steps/Sec: 8.97, Grad Norm: 0.0061 +[2025-02-20 13:16:32] (step=0892100) Train Loss: 0.1501, Train Steps/Sec: 10.92, Grad Norm: 0.0062 +[2025-02-20 13:16:41] (step=0892200) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 13:16:50] (step=0892300) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 13:16:58] (step=0892400) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 13:17:07] (step=0892500) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 13:17:15] (step=0892600) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 13:17:24] (step=0892700) Train Loss: 0.1493, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-20 13:17:33] (step=0892800) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 13:17:42] (step=0892900) Train Loss: 0.1486, Train Steps/Sec: 11.28, Grad Norm: 0.0060 +[2025-02-20 13:17:50] (step=0893000) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 13:17:59] (step=0893100) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0064 +[2025-02-20 13:18:08] (step=0893200) Train Loss: 0.1487, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 13:18:18] (step=0893300) Train Loss: 0.1508, Train Steps/Sec: 10.01, Grad Norm: 0.0059 +[2025-02-20 13:18:26] (step=0893400) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 13:18:35] (step=0893500) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 13:18:43] (step=0893600) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 13:18:52] (step=0893700) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 13:19:01] (step=0893800) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 13:19:09] (step=0893900) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 13:19:18] (step=0894000) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 13:19:27] (step=0894100) Train Loss: 0.1496, Train Steps/Sec: 10.95, Grad Norm: 0.0062 +[2025-02-20 13:19:36] (step=0894200) Train Loss: 0.1505, Train Steps/Sec: 10.89, Grad Norm: 0.0063 +[2025-02-20 13:19:45] (step=0894300) Train Loss: 0.1512, Train Steps/Sec: 10.94, Grad Norm: 0.0061 +[2025-02-20 13:19:54] (step=0894400) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 13:20:04] (step=0894500) Train Loss: 0.1489, Train Steps/Sec: 10.03, Grad Norm: 0.0063 +[2025-02-20 13:20:13] (step=0894600) Train Loss: 0.1486, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 13:20:21] (step=0894700) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:20:30] (step=0894800) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 13:20:39] (step=0894900) Train Loss: 0.1508, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 13:20:47] (step=0895000) Train Loss: 0.1493, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 13:20:56] (step=0895100) Train Loss: 0.1493, Train Steps/Sec: 11.29, Grad Norm: 0.0065 +[2025-02-20 13:21:05] (step=0895200) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:21:13] (step=0895300) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 13:21:22] (step=0895400) Train Loss: 0.1486, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 13:21:30] (step=0895500) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 13:21:39] (step=0895600) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:21:48] (step=0895700) Train Loss: 0.1488, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 13:21:58] (step=0895800) Train Loss: 0.1510, Train Steps/Sec: 10.11, Grad Norm: 0.0062 +[2025-02-20 13:22:06] (step=0895900) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 13:22:15] (step=0896000) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 13:22:23] (step=0896100) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 13:22:32] (step=0896200) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 13:22:41] (step=0896300) Train Loss: 0.1497, Train Steps/Sec: 10.59, Grad Norm: 0.0058 +[2025-02-20 13:22:51] (step=0896400) Train Loss: 0.1489, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-20 13:22:59] (step=0896500) Train Loss: 0.1492, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-20 13:23:08] (step=0896600) Train Loss: 0.1486, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 13:23:17] (step=0896700) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 13:23:25] (step=0896800) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 13:23:34] (step=0896900) Train Loss: 0.1485, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 13:23:44] (step=0897000) Train Loss: 0.1491, Train Steps/Sec: 9.99, Grad Norm: 0.0062 +[2025-02-20 13:23:53] (step=0897100) Train Loss: 0.1490, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 13:24:02] (step=0897200) Train Loss: 0.1499, Train Steps/Sec: 11.28, Grad Norm: 0.0060 +[2025-02-20 13:24:10] (step=0897300) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 13:24:19] (step=0897400) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0065 +[2025-02-20 13:24:27] (step=0897500) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 13:24:36] (step=0897600) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 13:24:45] (step=0897700) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 13:24:53] (step=0897800) Train Loss: 0.1489, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 13:25:02] (step=0897900) Train Loss: 0.1484, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 13:25:10] (step=0898000) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 13:25:19] (step=0898100) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 13:25:28] (step=0898200) Train Loss: 0.1499, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 13:25:38] (step=0898300) Train Loss: 0.1502, Train Steps/Sec: 10.07, Grad Norm: 0.0061 +[2025-02-20 13:25:46] (step=0898400) Train Loss: 0.1496, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 13:25:56] (step=0898500) Train Loss: 0.1504, Train Steps/Sec: 10.27, Grad Norm: 0.0063 +[2025-02-20 13:26:05] (step=0898600) Train Loss: 0.1505, Train Steps/Sec: 10.97, Grad Norm: 0.0063 +[2025-02-20 13:26:14] (step=0898700) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 13:26:22] (step=0898800) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 13:26:31] (step=0898900) Train Loss: 0.1504, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:26:39] (step=0899000) Train Loss: 0.1490, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 13:26:48] (step=0899100) Train Loss: 0.1485, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 13:26:57] (step=0899200) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 13:27:06] (step=0899300) Train Loss: 0.1503, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 13:27:14] (step=0899400) Train Loss: 0.1495, Train Steps/Sec: 11.27, Grad Norm: 0.0062 +[2025-02-20 13:27:24] (step=0899500) Train Loss: 0.1495, Train Steps/Sec: 9.99, Grad Norm: 0.0058 +[2025-02-20 13:27:33] (step=0899600) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 13:27:42] (step=0899700) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 13:27:50] (step=0899800) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 13:27:59] (step=0899900) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:28:07] (step=0900000) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 13:28:08] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0900000.pt +[2025-02-20 13:28:17] (step=0900100) Train Loss: 0.1505, Train Steps/Sec: 10.62, Grad Norm: 0.0063 +[2025-02-20 13:28:25] (step=0900200) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 13:28:34] (step=0900300) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 13:28:43] (step=0900400) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:28:51] (step=0900500) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:29:00] (step=0900600) Train Loss: 0.1489, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:29:10] (step=0900700) Train Loss: 0.1506, Train Steps/Sec: 10.24, Grad Norm: 0.0061 +[2025-02-20 13:29:20] (step=0900800) Train Loss: 0.1499, Train Steps/Sec: 9.55, Grad Norm: 0.0060 +[2025-02-20 13:29:29] (step=0900900) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 13:29:37] (step=0901000) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 13:29:46] (step=0901100) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 13:29:54] (step=0901200) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 13:30:03] (step=0901300) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 13:30:12] (step=0901400) Train Loss: 0.1509, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 13:30:21] (step=0901500) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 13:30:29] (step=0901600) Train Loss: 0.1495, Train Steps/Sec: 11.29, Grad Norm: 0.0059 +[2025-02-20 13:30:38] (step=0901700) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 13:30:47] (step=0901800) Train Loss: 0.1514, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 13:30:55] (step=0901900) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 13:31:05] (step=0902000) Train Loss: 0.1498, Train Steps/Sec: 9.98, Grad Norm: 0.0061 +[2025-02-20 13:31:14] (step=0902100) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:31:22] (step=0902200) Train Loss: 0.1495, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 13:31:31] (step=0902300) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 13:31:40] (step=0902400) Train Loss: 0.1492, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 13:31:48] (step=0902500) Train Loss: 0.1490, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:31:57] (step=0902600) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 13:32:05] (step=0902700) Train Loss: 0.1485, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 13:32:15] (step=0902800) Train Loss: 0.1492, Train Steps/Sec: 10.59, Grad Norm: 0.0060 +[2025-02-20 13:32:24] (step=0902900) Train Loss: 0.1508, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-20 13:32:33] (step=0903000) Train Loss: 0.1492, Train Steps/Sec: 10.90, Grad Norm: 0.0062 +[2025-02-20 13:32:42] (step=0903100) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 13:32:50] (step=0903200) Train Loss: 0.1505, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 13:33:00] (step=0903300) Train Loss: 0.1496, Train Steps/Sec: 10.06, Grad Norm: 0.0061 +[2025-02-20 13:33:09] (step=0903400) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 13:33:18] (step=0903500) Train Loss: 0.1491, Train Steps/Sec: 11.21, Grad Norm: 0.0057 +[2025-02-20 13:33:26] (step=0903600) Train Loss: 0.1511, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 13:33:35] (step=0903700) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 13:33:44] (step=0903800) Train Loss: 0.1499, Train Steps/Sec: 11.26, Grad Norm: 0.0063 +[2025-02-20 13:33:52] (step=0903900) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 13:34:01] (step=0904000) Train Loss: 0.1509, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 13:34:10] (step=0904100) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 13:34:18] (step=0904200) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 13:34:27] (step=0904300) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 13:34:36] (step=0904400) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 13:34:46] (step=0904500) Train Loss: 0.1505, Train Steps/Sec: 9.99, Grad Norm: 0.0060 +[2025-02-20 13:34:54] (step=0904600) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 13:35:03] (step=0904700) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 13:35:11] (step=0904800) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 13:35:20] (step=0904900) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 13:35:29] (step=0905000) Train Loss: 0.1496, Train Steps/Sec: 10.58, Grad Norm: 0.0061 +[2025-02-20 13:35:39] (step=0905100) Train Loss: 0.1496, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-20 13:35:47] (step=0905200) Train Loss: 0.1498, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-20 13:35:56] (step=0905300) Train Loss: 0.1491, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 13:36:05] (step=0905400) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:36:13] (step=0905500) Train Loss: 0.1480, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 13:36:22] (step=0905600) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:36:31] (step=0905700) Train Loss: 0.1509, Train Steps/Sec: 11.20, Grad Norm: 0.0062 +[2025-02-20 13:36:41] (step=0905800) Train Loss: 0.1492, Train Steps/Sec: 10.07, Grad Norm: 0.0061 +[2025-02-20 13:36:50] (step=0905900) Train Loss: 0.1499, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 13:36:58] (step=0906000) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 13:37:07] (step=0906100) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 13:37:15] (step=0906200) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 13:37:24] (step=0906300) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 13:37:33] (step=0906400) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 13:37:41] (step=0906500) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 13:37:50] (step=0906600) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 13:37:58] (step=0906700) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:38:07] (step=0906800) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 13:38:16] (step=0906900) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 13:38:26] (step=0907000) Train Loss: 0.1502, Train Steps/Sec: 10.00, Grad Norm: 0.0060 +[2025-02-20 13:38:34] (step=0907100) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 13:38:44] (step=0907200) Train Loss: 0.1508, Train Steps/Sec: 10.57, Grad Norm: 0.0059 +[2025-02-20 13:38:53] (step=0907300) Train Loss: 0.1484, Train Steps/Sec: 10.62, Grad Norm: 0.0061 +[2025-02-20 13:39:02] (step=0907400) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 13:39:10] (step=0907500) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 13:39:19] (step=0907600) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 13:39:27] (step=0907700) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 13:39:36] (step=0907800) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 13:39:45] (step=0907900) Train Loss: 0.1493, Train Steps/Sec: 11.25, Grad Norm: 0.0062 +[2025-02-20 13:39:54] (step=0908000) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 13:40:02] (step=0908100) Train Loss: 0.1487, Train Steps/Sec: 11.30, Grad Norm: 0.0061 +[2025-02-20 13:40:11] (step=0908200) Train Loss: 0.1496, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 13:40:21] (step=0908300) Train Loss: 0.1491, Train Steps/Sec: 10.03, Grad Norm: 0.0060 +[2025-02-20 13:40:30] (step=0908400) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 13:40:38] (step=0908500) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 13:40:47] (step=0908600) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 13:40:55] (step=0908700) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:41:04] (step=0908800) Train Loss: 0.1487, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 13:41:13] (step=0908900) Train Loss: 0.1503, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 13:41:21] (step=0909000) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 13:41:30] (step=0909100) Train Loss: 0.1496, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 13:41:38] (step=0909200) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:41:47] (step=0909300) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 13:41:57] (step=0909400) Train Loss: 0.1503, Train Steps/Sec: 10.27, Grad Norm: 0.0064 +[2025-02-20 13:42:07] (step=0909500) Train Loss: 0.1498, Train Steps/Sec: 9.54, Grad Norm: 0.0058 +[2025-02-20 13:42:16] (step=0909600) Train Loss: 0.1492, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 13:42:24] (step=0909700) Train Loss: 0.1499, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 13:42:33] (step=0909800) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 13:42:42] (step=0909900) Train Loss: 0.1499, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 13:42:50] (step=0910000) Train Loss: 0.1494, Train Steps/Sec: 11.26, Grad Norm: 0.0059 +[2025-02-20 13:42:59] (step=0910100) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 13:43:08] (step=0910200) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 13:43:16] (step=0910300) Train Loss: 0.1504, Train Steps/Sec: 11.31, Grad Norm: 0.0059 +[2025-02-20 13:43:25] (step=0910400) Train Loss: 0.1487, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:43:34] (step=0910500) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 13:43:42] (step=0910600) Train Loss: 0.1496, Train Steps/Sec: 11.58, Grad Norm: 0.0064 +[2025-02-20 13:43:51] (step=0910700) Train Loss: 0.1497, Train Steps/Sec: 11.53, Grad Norm: 0.0060 +[2025-02-20 13:44:01] (step=0910800) Train Loss: 0.1499, Train Steps/Sec: 10.01, Grad Norm: 0.0061 +[2025-02-20 13:44:10] (step=0910900) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 13:44:18] (step=0911000) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 13:44:27] (step=0911100) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:44:35] (step=0911200) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:44:44] (step=0911300) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:44:52] (step=0911400) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 13:45:02] (step=0911500) Train Loss: 0.1515, Train Steps/Sec: 10.61, Grad Norm: 0.0059 +[2025-02-20 13:45:11] (step=0911600) Train Loss: 0.1511, Train Steps/Sec: 11.25, Grad Norm: 0.0059 +[2025-02-20 13:45:20] (step=0911700) Train Loss: 0.1496, Train Steps/Sec: 10.97, Grad Norm: 0.0063 +[2025-02-20 13:45:29] (step=0911800) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 13:45:37] (step=0911900) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:45:47] (step=0912000) Train Loss: 0.1491, Train Steps/Sec: 10.02, Grad Norm: 0.0062 +[2025-02-20 13:45:56] (step=0912100) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 13:46:05] (step=0912200) Train Loss: 0.1501, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-20 13:46:13] (step=0912300) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 13:46:22] (step=0912400) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 13:46:31] (step=0912500) Train Loss: 0.1496, Train Steps/Sec: 11.27, Grad Norm: 0.0058 +[2025-02-20 13:46:39] (step=0912600) Train Loss: 0.1516, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 13:46:48] (step=0912700) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 13:46:57] (step=0912800) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 13:47:05] (step=0912900) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 13:47:14] (step=0913000) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 13:47:22] (step=0913100) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 13:47:31] (step=0913200) Train Loss: 0.1497, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-20 13:47:41] (step=0913300) Train Loss: 0.1494, Train Steps/Sec: 10.00, Grad Norm: 0.0058 +[2025-02-20 13:47:50] (step=0913400) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 13:47:58] (step=0913500) Train Loss: 0.1482, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 13:48:07] (step=0913600) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 13:48:16] (step=0913700) Train Loss: 0.1496, Train Steps/Sec: 10.59, Grad Norm: 0.0059 +[2025-02-20 13:48:25] (step=0913800) Train Loss: 0.1497, Train Steps/Sec: 10.92, Grad Norm: 0.0060 +[2025-02-20 13:48:34] (step=0913900) Train Loss: 0.1515, Train Steps/Sec: 11.27, Grad Norm: 0.0063 +[2025-02-20 13:48:43] (step=0914000) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 13:48:52] (step=0914100) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 13:49:00] (step=0914200) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 13:49:09] (step=0914300) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 13:49:18] (step=0914400) Train Loss: 0.1492, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 13:49:28] (step=0914500) Train Loss: 0.1496, Train Steps/Sec: 10.01, Grad Norm: 0.0061 +[2025-02-20 13:49:36] (step=0914600) Train Loss: 0.1503, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 13:49:45] (step=0914700) Train Loss: 0.1480, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 13:49:54] (step=0914800) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 13:50:02] (step=0914900) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 13:50:11] (step=0915000) Train Loss: 0.1502, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 13:50:20] (step=0915100) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 13:50:28] (step=0915200) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 13:50:37] (step=0915300) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 13:50:45] (step=0915400) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 13:50:54] (step=0915500) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 13:51:03] (step=0915600) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 13:51:11] (step=0915700) Train Loss: 0.1507, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 13:51:21] (step=0915800) Train Loss: 0.1502, Train Steps/Sec: 10.07, Grad Norm: 0.0064 +[2025-02-20 13:51:31] (step=0915900) Train Loss: 0.1496, Train Steps/Sec: 10.26, Grad Norm: 0.0060 +[2025-02-20 13:51:40] (step=0916000) Train Loss: 0.1499, Train Steps/Sec: 10.92, Grad Norm: 0.0063 +[2025-02-20 13:51:49] (step=0916100) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 13:51:57] (step=0916200) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 13:52:06] (step=0916300) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 13:52:15] (step=0916400) Train Loss: 0.1497, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:52:23] (step=0916500) Train Loss: 0.1500, Train Steps/Sec: 11.26, Grad Norm: 0.0059 +[2025-02-20 13:52:32] (step=0916600) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 13:52:41] (step=0916700) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 13:52:49] (step=0916800) Train Loss: 0.1497, Train Steps/Sec: 11.29, Grad Norm: 0.0058 +[2025-02-20 13:52:58] (step=0916900) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 13:53:08] (step=0917000) Train Loss: 0.1490, Train Steps/Sec: 9.94, Grad Norm: 0.0060 +[2025-02-20 13:53:17] (step=0917100) Train Loss: 0.1503, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 13:53:25] (step=0917200) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 13:53:34] (step=0917300) Train Loss: 0.1493, Train Steps/Sec: 11.66, Grad Norm: 0.0063 +[2025-02-20 13:53:42] (step=0917400) Train Loss: 0.1487, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 13:53:51] (step=0917500) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0064 +[2025-02-20 13:54:00] (step=0917600) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 13:54:08] (step=0917700) Train Loss: 0.1476, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 13:54:17] (step=0917800) Train Loss: 0.1496, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 13:54:25] (step=0917900) Train Loss: 0.1496, Train Steps/Sec: 11.65, Grad Norm: 0.0058 +[2025-02-20 13:54:34] (step=0918000) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 13:54:44] (step=0918100) Train Loss: 0.1504, Train Steps/Sec: 10.25, Grad Norm: 0.0059 +[2025-02-20 13:54:53] (step=0918200) Train Loss: 0.1504, Train Steps/Sec: 10.97, Grad Norm: 0.0059 +[2025-02-20 13:55:03] (step=0918300) Train Loss: 0.1506, Train Steps/Sec: 10.02, Grad Norm: 0.0062 +[2025-02-20 13:55:11] (step=0918400) Train Loss: 0.1494, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 13:55:20] (step=0918500) Train Loss: 0.1497, Train Steps/Sec: 11.66, Grad Norm: 0.0060 +[2025-02-20 13:55:29] (step=0918600) Train Loss: 0.1500, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 13:55:37] (step=0918700) Train Loss: 0.1509, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-20 13:55:46] (step=0918800) Train Loss: 0.1490, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 13:55:55] (step=0918900) Train Loss: 0.1497, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 13:56:03] (step=0919000) Train Loss: 0.1496, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-20 13:56:12] (step=0919100) Train Loss: 0.1493, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 13:56:21] (step=0919200) Train Loss: 0.1513, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-20 13:56:29] (step=0919300) Train Loss: 0.1505, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-20 13:56:38] (step=0919400) Train Loss: 0.1500, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-20 13:56:48] (step=0919500) Train Loss: 0.1493, Train Steps/Sec: 10.04, Grad Norm: 0.0059 +[2025-02-20 13:56:56] (step=0919600) Train Loss: 0.1496, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 13:57:05] (step=0919700) Train Loss: 0.1500, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 13:57:13] (step=0919800) Train Loss: 0.1494, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-20 13:57:22] (step=0919900) Train Loss: 0.1497, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 13:57:31] (step=0920000) Train Loss: 0.1488, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:57:32] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0920000.pt +[2025-02-20 13:57:40] (step=0920100) Train Loss: 0.1502, Train Steps/Sec: 10.57, Grad Norm: 0.0060 +[2025-02-20 13:57:50] (step=0920200) Train Loss: 0.1486, Train Steps/Sec: 10.57, Grad Norm: 0.0060 +[2025-02-20 13:57:58] (step=0920300) Train Loss: 0.1493, Train Steps/Sec: 11.25, Grad Norm: 0.0062 +[2025-02-20 13:58:08] (step=0920400) Train Loss: 0.1500, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-20 13:58:16] (step=0920500) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 13:58:25] (step=0920600) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 13:58:33] (step=0920700) Train Loss: 0.1490, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 13:58:43] (step=0920800) Train Loss: 0.1497, Train Steps/Sec: 9.96, Grad Norm: 0.0060 +[2025-02-20 13:58:52] (step=0920900) Train Loss: 0.1503, Train Steps/Sec: 11.23, Grad Norm: 0.0059 +[2025-02-20 13:59:01] (step=0921000) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 13:59:10] (step=0921100) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:59:18] (step=0921200) Train Loss: 0.1489, Train Steps/Sec: 11.28, Grad Norm: 0.0058 +[2025-02-20 13:59:27] (step=0921300) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 13:59:36] (step=0921400) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0058 +[2025-02-20 13:59:44] (step=0921500) Train Loss: 0.1488, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 13:59:53] (step=0921600) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:00:01] (step=0921700) Train Loss: 0.1488, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:00:10] (step=0921800) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 14:00:19] (step=0921900) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 14:00:28] (step=0922000) Train Loss: 0.1494, Train Steps/Sec: 10.07, Grad Norm: 0.0063 +[2025-02-20 14:00:37] (step=0922100) Train Loss: 0.1510, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 14:00:46] (step=0922200) Train Loss: 0.1490, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 14:00:54] (step=0922300) Train Loss: 0.1492, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 14:01:04] (step=0922400) Train Loss: 0.1492, Train Steps/Sec: 10.59, Grad Norm: 0.0058 +[2025-02-20 14:01:13] (step=0922500) Train Loss: 0.1506, Train Steps/Sec: 10.93, Grad Norm: 0.0062 +[2025-02-20 14:01:22] (step=0922600) Train Loss: 0.1494, Train Steps/Sec: 11.29, Grad Norm: 0.0059 +[2025-02-20 14:01:30] (step=0922700) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 14:01:39] (step=0922800) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 14:01:47] (step=0922900) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 14:01:56] (step=0923000) Train Loss: 0.1507, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 14:02:05] (step=0923100) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 14:02:14] (step=0923200) Train Loss: 0.1513, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:02:24] (step=0923300) Train Loss: 0.1489, Train Steps/Sec: 9.75, Grad Norm: 0.0062 +[2025-02-20 14:02:32] (step=0923400) Train Loss: 0.1491, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 14:02:41] (step=0923500) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 14:02:50] (step=0923600) Train Loss: 0.1487, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 14:02:58] (step=0923700) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 14:03:07] (step=0923800) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:03:15] (step=0923900) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 14:03:24] (step=0924000) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 14:03:33] (step=0924100) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:03:41] (step=0924200) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:03:50] (step=0924300) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 14:03:58] (step=0924400) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 14:04:08] (step=0924500) Train Loss: 0.1499, Train Steps/Sec: 10.01, Grad Norm: 0.0059 +[2025-02-20 14:04:18] (step=0924600) Train Loss: 0.1484, Train Steps/Sec: 10.25, Grad Norm: 0.0061 +[2025-02-20 14:04:27] (step=0924700) Train Loss: 0.1507, Train Steps/Sec: 10.97, Grad Norm: 0.0061 +[2025-02-20 14:04:36] (step=0924800) Train Loss: 0.1487, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 14:04:45] (step=0924900) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 14:04:53] (step=0925000) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:05:02] (step=0925100) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:05:11] (step=0925200) Train Loss: 0.1500, Train Steps/Sec: 11.22, Grad Norm: 0.0059 +[2025-02-20 14:05:19] (step=0925300) Train Loss: 0.1502, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 14:05:28] (step=0925400) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 14:05:37] (step=0925500) Train Loss: 0.1499, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-20 14:05:45] (step=0925600) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 14:05:54] (step=0925700) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 14:06:04] (step=0925800) Train Loss: 0.1502, Train Steps/Sec: 9.90, Grad Norm: 0.0060 +[2025-02-20 14:06:13] (step=0925900) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:06:21] (step=0926000) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 14:06:30] (step=0926100) Train Loss: 0.1484, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 14:06:39] (step=0926200) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 14:06:47] (step=0926300) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 14:06:56] (step=0926400) Train Loss: 0.1512, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:07:04] (step=0926500) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 14:07:13] (step=0926600) Train Loss: 0.1485, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 14:07:22] (step=0926700) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 14:07:31] (step=0926800) Train Loss: 0.1482, Train Steps/Sec: 10.24, Grad Norm: 0.0061 +[2025-02-20 14:07:41] (step=0926900) Train Loss: 0.1502, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-20 14:07:51] (step=0927000) Train Loss: 0.1496, Train Steps/Sec: 9.93, Grad Norm: 0.0060 +[2025-02-20 14:07:59] (step=0927100) Train Loss: 0.1479, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 14:08:08] (step=0927200) Train Loss: 0.1494, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 14:08:16] (step=0927300) Train Loss: 0.1487, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 14:08:25] (step=0927400) Train Loss: 0.1502, Train Steps/Sec: 11.23, Grad Norm: 0.0059 +[2025-02-20 14:08:34] (step=0927500) Train Loss: 0.1510, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 14:08:43] (step=0927600) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:08:51] (step=0927700) Train Loss: 0.1491, Train Steps/Sec: 11.28, Grad Norm: 0.0059 +[2025-02-20 14:09:00] (step=0927800) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 14:09:09] (step=0927900) Train Loss: 0.1483, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 14:09:17] (step=0928000) Train Loss: 0.1492, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 14:09:26] (step=0928100) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:09:35] (step=0928200) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 14:09:45] (step=0928300) Train Loss: 0.1491, Train Steps/Sec: 10.02, Grad Norm: 0.0058 +[2025-02-20 14:09:53] (step=0928400) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 14:10:02] (step=0928500) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:10:10] (step=0928600) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:10:19] (step=0928700) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:10:28] (step=0928800) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0067 +[2025-02-20 14:10:37] (step=0928900) Train Loss: 0.1490, Train Steps/Sec: 10.60, Grad Norm: 0.0065 +[2025-02-20 14:10:46] (step=0929000) Train Loss: 0.1493, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 14:10:55] (step=0929100) Train Loss: 0.1502, Train Steps/Sec: 10.96, Grad Norm: 0.0064 +[2025-02-20 14:11:04] (step=0929200) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:11:12] (step=0929300) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 14:11:21] (step=0929400) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:11:31] (step=0929500) Train Loss: 0.1489, Train Steps/Sec: 9.75, Grad Norm: 0.0063 +[2025-02-20 14:11:40] (step=0929600) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 14:11:48] (step=0929700) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 14:11:57] (step=0929800) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:12:06] (step=0929900) Train Loss: 0.1504, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-20 14:12:14] (step=0930000) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 14:12:23] (step=0930100) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:12:32] (step=0930200) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 14:12:40] (step=0930300) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:12:49] (step=0930400) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 14:12:57] (step=0930500) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 14:13:06] (step=0930600) Train Loss: 0.1484, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 14:13:15] (step=0930700) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:13:25] (step=0930800) Train Loss: 0.1490, Train Steps/Sec: 9.99, Grad Norm: 0.0060 +[2025-02-20 14:13:33] (step=0930900) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:13:42] (step=0931000) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 14:13:52] (step=0931100) Train Loss: 0.1492, Train Steps/Sec: 10.25, Grad Norm: 0.0061 +[2025-02-20 14:14:01] (step=0931200) Train Loss: 0.1500, Train Steps/Sec: 10.97, Grad Norm: 0.0059 +[2025-02-20 14:14:09] (step=0931300) Train Loss: 0.1483, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 14:14:18] (step=0931400) Train Loss: 0.1491, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 14:14:26] (step=0931500) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 14:14:35] (step=0931600) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:14:44] (step=0931700) Train Loss: 0.1508, Train Steps/Sec: 11.26, Grad Norm: 0.0059 +[2025-02-20 14:14:52] (step=0931800) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 14:15:01] (step=0931900) Train Loss: 0.1500, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 14:15:11] (step=0932000) Train Loss: 0.1495, Train Steps/Sec: 10.02, Grad Norm: 0.0062 +[2025-02-20 14:15:20] (step=0932100) Train Loss: 0.1491, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-20 14:15:29] (step=0932200) Train Loss: 0.1491, Train Steps/Sec: 11.46, Grad Norm: 0.0062 +[2025-02-20 14:15:37] (step=0932300) Train Loss: 0.1508, Train Steps/Sec: 11.45, Grad Norm: 0.0062 +[2025-02-20 14:15:46] (step=0932400) Train Loss: 0.1491, Train Steps/Sec: 11.47, Grad Norm: 0.0064 +[2025-02-20 14:15:55] (step=0932500) Train Loss: 0.1498, Train Steps/Sec: 11.51, Grad Norm: 0.0059 +[2025-02-20 14:16:03] (step=0932600) Train Loss: 0.1485, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:16:12] (step=0932700) Train Loss: 0.1486, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 14:16:21] (step=0932800) Train Loss: 0.1505, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 14:16:29] (step=0932900) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 14:16:38] (step=0933000) Train Loss: 0.1490, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 14:16:46] (step=0933100) Train Loss: 0.1496, Train Steps/Sec: 11.67, Grad Norm: 0.0059 +[2025-02-20 14:16:55] (step=0933200) Train Loss: 0.1502, Train Steps/Sec: 11.67, Grad Norm: 0.0062 +[2025-02-20 14:17:06] (step=0933300) Train Loss: 0.1506, Train Steps/Sec: 8.96, Grad Norm: 0.0062 +[2025-02-20 14:17:15] (step=0933400) Train Loss: 0.1504, Train Steps/Sec: 10.96, Grad Norm: 0.0061 +[2025-02-20 14:17:24] (step=0933500) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 14:17:32] (step=0933600) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:17:41] (step=0933700) Train Loss: 0.1486, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 14:17:50] (step=0933800) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:17:59] (step=0933900) Train Loss: 0.1518, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 14:18:07] (step=0934000) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:18:16] (step=0934100) Train Loss: 0.1491, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 14:18:25] (step=0934200) Train Loss: 0.1496, Train Steps/Sec: 11.30, Grad Norm: 0.0059 +[2025-02-20 14:18:33] (step=0934300) Train Loss: 0.1488, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 14:18:42] (step=0934400) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 14:18:52] (step=0934500) Train Loss: 0.1500, Train Steps/Sec: 9.96, Grad Norm: 0.0061 +[2025-02-20 14:19:00] (step=0934600) Train Loss: 0.1492, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 14:19:09] (step=0934700) Train Loss: 0.1487, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 14:19:18] (step=0934800) Train Loss: 0.1490, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 14:19:26] (step=0934900) Train Loss: 0.1507, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 14:19:35] (step=0935000) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 14:19:43] (step=0935100) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:19:52] (step=0935200) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 14:20:01] (step=0935300) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 14:20:10] (step=0935400) Train Loss: 0.1517, Train Steps/Sec: 10.93, Grad Norm: 0.0062 +[2025-02-20 14:20:19] (step=0935500) Train Loss: 0.1512, Train Steps/Sec: 10.89, Grad Norm: 0.0059 +[2025-02-20 14:20:28] (step=0935600) Train Loss: 0.1508, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-20 14:20:37] (step=0935700) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 14:20:47] (step=0935800) Train Loss: 0.1495, Train Steps/Sec: 9.95, Grad Norm: 0.0061 +[2025-02-20 14:20:55] (step=0935900) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 14:21:04] (step=0936000) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:21:13] (step=0936100) Train Loss: 0.1500, Train Steps/Sec: 11.22, Grad Norm: 0.0063 +[2025-02-20 14:21:21] (step=0936200) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 14:21:30] (step=0936300) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 14:21:39] (step=0936400) Train Loss: 0.1497, Train Steps/Sec: 11.26, Grad Norm: 0.0062 +[2025-02-20 14:21:48] (step=0936500) Train Loss: 0.1488, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 14:21:56] (step=0936600) Train Loss: 0.1505, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 14:22:05] (step=0936700) Train Loss: 0.1503, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 14:22:13] (step=0936800) Train Loss: 0.1498, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 14:22:22] (step=0936900) Train Loss: 0.1505, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 14:22:32] (step=0937000) Train Loss: 0.1504, Train Steps/Sec: 10.07, Grad Norm: 0.0061 +[2025-02-20 14:22:41] (step=0937100) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:22:49] (step=0937200) Train Loss: 0.1487, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:22:58] (step=0937300) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:23:06] (step=0937400) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:23:15] (step=0937500) Train Loss: 0.1488, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 14:23:24] (step=0937600) Train Loss: 0.1496, Train Steps/Sec: 10.59, Grad Norm: 0.0060 +[2025-02-20 14:23:34] (step=0937700) Train Loss: 0.1496, Train Steps/Sec: 10.92, Grad Norm: 0.0064 +[2025-02-20 14:23:42] (step=0937800) Train Loss: 0.1492, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 14:23:51] (step=0937900) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 14:24:00] (step=0938000) Train Loss: 0.1480, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 14:24:08] (step=0938100) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 14:24:17] (step=0938200) Train Loss: 0.1500, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-20 14:24:27] (step=0938300) Train Loss: 0.1507, Train Steps/Sec: 10.06, Grad Norm: 0.0060 +[2025-02-20 14:24:36] (step=0938400) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 14:24:44] (step=0938500) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:24:53] (step=0938600) Train Loss: 0.1493, Train Steps/Sec: 11.27, Grad Norm: 0.0058 +[2025-02-20 14:25:02] (step=0938700) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:25:10] (step=0938800) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:25:19] (step=0938900) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:25:28] (step=0939000) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:25:36] (step=0939100) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 14:25:45] (step=0939200) Train Loss: 0.1519, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 14:25:53] (step=0939300) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 14:26:02] (step=0939400) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:26:11] (step=0939500) Train Loss: 0.1506, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 14:26:21] (step=0939600) Train Loss: 0.1495, Train Steps/Sec: 10.02, Grad Norm: 0.0072 +[2025-02-20 14:26:29] (step=0939700) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:26:39] (step=0939800) Train Loss: 0.1495, Train Steps/Sec: 10.56, Grad Norm: 0.0059 +[2025-02-20 14:26:48] (step=0939900) Train Loss: 0.1494, Train Steps/Sec: 10.60, Grad Norm: 0.0061 +[2025-02-20 14:26:57] (step=0940000) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:26:57] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0940000.pt +[2025-02-20 14:27:06] (step=0940100) Train Loss: 0.1494, Train Steps/Sec: 10.81, Grad Norm: 0.0061 +[2025-02-20 14:27:15] (step=0940200) Train Loss: 0.1484, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:27:23] (step=0940300) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:27:32] (step=0940400) Train Loss: 0.1497, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-20 14:27:41] (step=0940500) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 14:27:49] (step=0940600) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 14:27:58] (step=0940700) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:28:08] (step=0940800) Train Loss: 0.1496, Train Steps/Sec: 9.73, Grad Norm: 0.0060 +[2025-02-20 14:28:17] (step=0940900) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 14:28:25] (step=0941000) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 14:28:34] (step=0941100) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:28:43] (step=0941200) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:28:51] (step=0941300) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 14:29:00] (step=0941400) Train Loss: 0.1487, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:29:09] (step=0941500) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:29:17] (step=0941600) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:29:26] (step=0941700) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:29:34] (step=0941800) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:29:43] (step=0941900) Train Loss: 0.1488, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 14:29:53] (step=0942000) Train Loss: 0.1490, Train Steps/Sec: 10.23, Grad Norm: 0.0064 +[2025-02-20 14:30:03] (step=0942100) Train Loss: 0.1510, Train Steps/Sec: 9.51, Grad Norm: 0.0062 +[2025-02-20 14:30:12] (step=0942200) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:30:20] (step=0942300) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:30:29] (step=0942400) Train Loss: 0.1486, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:30:38] (step=0942500) Train Loss: 0.1502, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 14:30:47] (step=0942600) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 14:30:55] (step=0942700) Train Loss: 0.1507, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 14:31:04] (step=0942800) Train Loss: 0.1521, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 14:31:13] (step=0942900) Train Loss: 0.1494, Train Steps/Sec: 11.29, Grad Norm: 0.0060 +[2025-02-20 14:31:21] (step=0943000) Train Loss: 0.1482, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:31:30] (step=0943100) Train Loss: 0.1484, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 14:31:38] (step=0943200) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 14:31:48] (step=0943300) Train Loss: 0.1492, Train Steps/Sec: 9.98, Grad Norm: 0.0059 +[2025-02-20 14:31:57] (step=0943400) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:32:06] (step=0943500) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 14:32:14] (step=0943600) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:32:23] (step=0943700) Train Loss: 0.1491, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 14:32:32] (step=0943800) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:32:40] (step=0943900) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:32:49] (step=0944000) Train Loss: 0.1483, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 14:32:58] (step=0944100) Train Loss: 0.1505, Train Steps/Sec: 10.60, Grad Norm: 0.0059 +[2025-02-20 14:33:07] (step=0944200) Train Loss: 0.1496, Train Steps/Sec: 10.91, Grad Norm: 0.0064 +[2025-02-20 14:33:16] (step=0944300) Train Loss: 0.1498, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 14:33:25] (step=0944400) Train Loss: 0.1513, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 14:33:33] (step=0944500) Train Loss: 0.1507, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 14:33:43] (step=0944600) Train Loss: 0.1505, Train Steps/Sec: 10.02, Grad Norm: 0.0062 +[2025-02-20 14:33:52] (step=0944700) Train Loss: 0.1510, Train Steps/Sec: 11.19, Grad Norm: 0.0061 +[2025-02-20 14:34:01] (step=0944800) Train Loss: 0.1496, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 14:34:10] (step=0944900) Train Loss: 0.1492, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 14:34:18] (step=0945000) Train Loss: 0.1485, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 14:34:27] (step=0945100) Train Loss: 0.1495, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-20 14:34:36] (step=0945200) Train Loss: 0.1494, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 14:34:44] (step=0945300) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0058 +[2025-02-20 14:34:53] (step=0945400) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 14:35:02] (step=0945500) Train Loss: 0.1490, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 14:35:10] (step=0945600) Train Loss: 0.1507, Train Steps/Sec: 11.58, Grad Norm: 0.0061 +[2025-02-20 14:35:19] (step=0945700) Train Loss: 0.1502, Train Steps/Sec: 11.58, Grad Norm: 0.0060 +[2025-02-20 14:35:29] (step=0945800) Train Loss: 0.1507, Train Steps/Sec: 9.92, Grad Norm: 0.0060 +[2025-02-20 14:35:38] (step=0945900) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 14:35:46] (step=0946000) Train Loss: 0.1487, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:35:55] (step=0946100) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 14:36:03] (step=0946200) Train Loss: 0.1492, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 14:36:13] (step=0946300) Train Loss: 0.1491, Train Steps/Sec: 10.60, Grad Norm: 0.0061 +[2025-02-20 14:36:22] (step=0946400) Train Loss: 0.1482, Train Steps/Sec: 10.94, Grad Norm: 0.0059 +[2025-02-20 14:36:31] (step=0946500) Train Loss: 0.1502, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 14:36:40] (step=0946600) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 14:36:48] (step=0946700) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 14:36:57] (step=0946800) Train Loss: 0.1492, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 14:37:06] (step=0946900) Train Loss: 0.1496, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-20 14:37:14] (step=0947000) Train Loss: 0.1503, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 14:37:24] (step=0947100) Train Loss: 0.1491, Train Steps/Sec: 10.03, Grad Norm: 0.0059 +[2025-02-20 14:37:33] (step=0947200) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 14:37:42] (step=0947300) Train Loss: 0.1498, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-20 14:37:50] (step=0947400) Train Loss: 0.1507, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 14:37:59] (step=0947500) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:38:08] (step=0947600) Train Loss: 0.1485, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 14:38:16] (step=0947700) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 14:38:25] (step=0947800) Train Loss: 0.1522, Train Steps/Sec: 11.62, Grad Norm: 0.0063 +[2025-02-20 14:38:33] (step=0947900) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 14:38:42] (step=0948000) Train Loss: 0.1481, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 14:38:51] (step=0948100) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 14:38:59] (step=0948200) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:39:09] (step=0948300) Train Loss: 0.1505, Train Steps/Sec: 9.98, Grad Norm: 0.0064 +[2025-02-20 14:39:18] (step=0948400) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 14:39:28] (step=0948500) Train Loss: 0.1500, Train Steps/Sec: 10.25, Grad Norm: 0.0061 +[2025-02-20 14:39:37] (step=0948600) Train Loss: 0.1496, Train Steps/Sec: 10.94, Grad Norm: 0.0061 +[2025-02-20 14:39:45] (step=0948700) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 14:39:54] (step=0948800) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 14:40:03] (step=0948900) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 14:40:11] (step=0949000) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 14:40:20] (step=0949100) Train Loss: 0.1492, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 14:40:29] (step=0949200) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 14:40:37] (step=0949300) Train Loss: 0.1489, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 14:40:46] (step=0949400) Train Loss: 0.1487, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 14:40:55] (step=0949500) Train Loss: 0.1508, Train Steps/Sec: 11.23, Grad Norm: 0.0064 +[2025-02-20 14:41:05] (step=0949600) Train Loss: 0.1488, Train Steps/Sec: 10.08, Grad Norm: 0.0061 +[2025-02-20 14:41:13] (step=0949700) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 14:41:22] (step=0949800) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:41:31] (step=0949900) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 14:41:39] (step=0950000) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:41:48] (step=0950100) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 14:41:56] (step=0950200) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 14:42:05] (step=0950300) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 14:42:14] (step=0950400) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 14:42:22] (step=0950500) Train Loss: 0.1495, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 14:42:31] (step=0950600) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 14:42:41] (step=0950700) Train Loss: 0.1512, Train Steps/Sec: 10.26, Grad Norm: 0.0059 +[2025-02-20 14:42:51] (step=0950800) Train Loss: 0.1496, Train Steps/Sec: 9.48, Grad Norm: 0.0060 +[2025-02-20 14:43:00] (step=0950900) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 14:43:08] (step=0951000) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 14:43:17] (step=0951100) Train Loss: 0.1484, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:43:26] (step=0951200) Train Loss: 0.1487, Train Steps/Sec: 11.25, Grad Norm: 0.0059 +[2025-02-20 14:43:34] (step=0951300) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:43:43] (step=0951400) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:43:52] (step=0951500) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 14:44:01] (step=0951600) Train Loss: 0.1498, Train Steps/Sec: 11.27, Grad Norm: 0.0063 +[2025-02-20 14:44:09] (step=0951700) Train Loss: 0.1483, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:44:18] (step=0951800) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:44:26] (step=0951900) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:44:35] (step=0952000) Train Loss: 0.1497, Train Steps/Sec: 11.58, Grad Norm: 0.0059 +[2025-02-20 14:44:45] (step=0952100) Train Loss: 0.1501, Train Steps/Sec: 9.99, Grad Norm: 0.0060 +[2025-02-20 14:44:54] (step=0952200) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:45:02] (step=0952300) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:45:11] (step=0952400) Train Loss: 0.1491, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 14:45:19] (step=0952500) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 14:45:28] (step=0952600) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 14:45:37] (step=0952700) Train Loss: 0.1491, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 14:45:46] (step=0952800) Train Loss: 0.1500, Train Steps/Sec: 10.60, Grad Norm: 0.0061 +[2025-02-20 14:45:55] (step=0952900) Train Loss: 0.1487, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-20 14:46:04] (step=0953000) Train Loss: 0.1503, Train Steps/Sec: 11.29, Grad Norm: 0.0060 +[2025-02-20 14:46:13] (step=0953100) Train Loss: 0.1482, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:46:21] (step=0953200) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 14:46:31] (step=0953300) Train Loss: 0.1504, Train Steps/Sec: 9.99, Grad Norm: 0.0062 +[2025-02-20 14:46:40] (step=0953400) Train Loss: 0.1509, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-20 14:46:49] (step=0953500) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:46:57] (step=0953600) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:47:06] (step=0953700) Train Loss: 0.1478, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 14:47:15] (step=0953800) Train Loss: 0.1495, Train Steps/Sec: 11.27, Grad Norm: 0.0066 +[2025-02-20 14:47:23] (step=0953900) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:47:32] (step=0954000) Train Loss: 0.1512, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:47:41] (step=0954100) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:47:49] (step=0954200) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:47:58] (step=0954300) Train Loss: 0.1510, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 14:48:06] (step=0954400) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:48:15] (step=0954500) Train Loss: 0.1487, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 14:48:25] (step=0954600) Train Loss: 0.1495, Train Steps/Sec: 10.08, Grad Norm: 0.0063 +[2025-02-20 14:48:34] (step=0954700) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 14:48:42] (step=0954800) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 14:48:51] (step=0954900) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 14:49:00] (step=0955000) Train Loss: 0.1492, Train Steps/Sec: 10.58, Grad Norm: 0.0062 +[2025-02-20 14:49:09] (step=0955100) Train Loss: 0.1505, Train Steps/Sec: 10.91, Grad Norm: 0.0063 +[2025-02-20 14:49:18] (step=0955200) Train Loss: 0.1499, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-20 14:49:27] (step=0955300) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:49:36] (step=0955400) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 14:49:44] (step=0955500) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:49:53] (step=0955600) Train Loss: 0.1499, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-20 14:50:02] (step=0955700) Train Loss: 0.1506, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 14:50:12] (step=0955800) Train Loss: 0.1501, Train Steps/Sec: 9.96, Grad Norm: 0.0059 +[2025-02-20 14:50:20] (step=0955900) Train Loss: 0.1493, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 14:50:29] (step=0956000) Train Loss: 0.1491, Train Steps/Sec: 11.25, Grad Norm: 0.0062 +[2025-02-20 14:50:38] (step=0956100) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 14:50:46] (step=0956200) Train Loss: 0.1501, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 14:50:55] (step=0956300) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 14:51:04] (step=0956400) Train Loss: 0.1489, Train Steps/Sec: 11.60, Grad Norm: 0.0058 +[2025-02-20 14:51:12] (step=0956500) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 14:51:21] (step=0956600) Train Loss: 0.1498, Train Steps/Sec: 11.58, Grad Norm: 0.0063 +[2025-02-20 14:51:29] (step=0956700) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:51:38] (step=0956800) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 14:51:47] (step=0956900) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 14:51:55] (step=0957000) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 14:52:05] (step=0957100) Train Loss: 0.1492, Train Steps/Sec: 10.05, Grad Norm: 0.0060 +[2025-02-20 14:52:15] (step=0957200) Train Loss: 0.1511, Train Steps/Sec: 10.24, Grad Norm: 0.0061 +[2025-02-20 14:52:24] (step=0957300) Train Loss: 0.1496, Train Steps/Sec: 10.95, Grad Norm: 0.0059 +[2025-02-20 14:52:33] (step=0957400) Train Loss: 0.1505, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 14:52:41] (step=0957500) Train Loss: 0.1489, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:52:50] (step=0957600) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 14:52:59] (step=0957700) Train Loss: 0.1487, Train Steps/Sec: 11.25, Grad Norm: 0.0059 +[2025-02-20 14:53:07] (step=0957800) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 14:53:16] (step=0957900) Train Loss: 0.1483, Train Steps/Sec: 11.63, Grad Norm: 0.0057 +[2025-02-20 14:53:25] (step=0958000) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 14:53:33] (step=0958100) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:53:42] (step=0958200) Train Loss: 0.1489, Train Steps/Sec: 11.26, Grad Norm: 0.0058 +[2025-02-20 14:53:52] (step=0958300) Train Loss: 0.1510, Train Steps/Sec: 9.99, Grad Norm: 0.0060 +[2025-02-20 14:54:01] (step=0958400) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 14:54:09] (step=0958500) Train Loss: 0.1481, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 14:54:18] (step=0958600) Train Loss: 0.1488, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 14:54:27] (step=0958700) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 14:54:35] (step=0958800) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0057 +[2025-02-20 14:54:44] (step=0958900) Train Loss: 0.1502, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 14:54:52] (step=0959000) Train Loss: 0.1516, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 14:55:01] (step=0959100) Train Loss: 0.1507, Train Steps/Sec: 11.65, Grad Norm: 0.0058 +[2025-02-20 14:55:09] (step=0959200) Train Loss: 0.1506, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 14:55:19] (step=0959300) Train Loss: 0.1498, Train Steps/Sec: 10.92, Grad Norm: 0.0060 +[2025-02-20 14:55:28] (step=0959400) Train Loss: 0.1510, Train Steps/Sec: 10.58, Grad Norm: 0.0059 +[2025-02-20 14:55:37] (step=0959500) Train Loss: 0.1500, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-20 14:55:47] (step=0959600) Train Loss: 0.1492, Train Steps/Sec: 10.03, Grad Norm: 0.0059 +[2025-02-20 14:55:56] (step=0959700) Train Loss: 0.1506, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-20 14:56:04] (step=0959800) Train Loss: 0.1500, Train Steps/Sec: 11.66, Grad Norm: 0.0063 +[2025-02-20 14:56:13] (step=0959900) Train Loss: 0.1481, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 14:56:22] (step=0960000) Train Loss: 0.1487, Train Steps/Sec: 11.67, Grad Norm: 0.0059 +[2025-02-20 14:56:22] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0960000.pt +[2025-02-20 14:56:31] (step=0960100) Train Loss: 0.1499, Train Steps/Sec: 10.63, Grad Norm: 0.0059 +[2025-02-20 14:56:40] (step=0960200) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 14:56:48] (step=0960300) Train Loss: 0.1505, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 14:56:57] (step=0960400) Train Loss: 0.1491, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-20 14:57:06] (step=0960500) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 14:57:14] (step=0960600) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 14:57:23] (step=0960700) Train Loss: 0.1487, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 14:57:33] (step=0960800) Train Loss: 0.1491, Train Steps/Sec: 10.03, Grad Norm: 0.0061 +[2025-02-20 14:57:41] (step=0960900) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 14:57:50] (step=0961000) Train Loss: 0.1497, Train Steps/Sec: 11.65, Grad Norm: 0.0062 +[2025-02-20 14:57:59] (step=0961100) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 14:58:07] (step=0961200) Train Loss: 0.1508, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 14:58:16] (step=0961300) Train Loss: 0.1486, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:58:24] (step=0961400) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 14:58:34] (step=0961500) Train Loss: 0.1496, Train Steps/Sec: 10.59, Grad Norm: 0.0062 +[2025-02-20 14:58:43] (step=0961600) Train Loss: 0.1500, Train Steps/Sec: 10.92, Grad Norm: 0.0064 +[2025-02-20 14:58:52] (step=0961700) Train Loss: 0.1507, Train Steps/Sec: 11.29, Grad Norm: 0.0058 +[2025-02-20 14:59:00] (step=0961800) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 14:59:09] (step=0961900) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 14:59:18] (step=0962000) Train Loss: 0.1492, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 14:59:28] (step=0962100) Train Loss: 0.1500, Train Steps/Sec: 9.78, Grad Norm: 0.0062 +[2025-02-20 14:59:36] (step=0962200) Train Loss: 0.1515, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 14:59:45] (step=0962300) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 14:59:54] (step=0962400) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 15:00:03] (step=0962500) Train Loss: 0.1497, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 15:00:11] (step=0962600) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 15:00:20] (step=0962700) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 15:00:28] (step=0962800) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:00:37] (step=0962900) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 15:00:46] (step=0963000) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:00:54] (step=0963100) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 15:01:03] (step=0963200) Train Loss: 0.1484, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:01:13] (step=0963300) Train Loss: 0.1499, Train Steps/Sec: 10.02, Grad Norm: 0.0063 +[2025-02-20 15:01:21] (step=0963400) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 15:01:30] (step=0963500) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 15:01:39] (step=0963600) Train Loss: 0.1495, Train Steps/Sec: 11.64, Grad Norm: 0.0065 +[2025-02-20 15:01:48] (step=0963700) Train Loss: 0.1487, Train Steps/Sec: 10.59, Grad Norm: 0.0063 +[2025-02-20 15:01:57] (step=0963800) Train Loss: 0.1502, Train Steps/Sec: 10.63, Grad Norm: 0.0062 +[2025-02-20 15:02:06] (step=0963900) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 15:02:15] (step=0964000) Train Loss: 0.1489, Train Steps/Sec: 11.65, Grad Norm: 0.0063 +[2025-02-20 15:02:23] (step=0964100) Train Loss: 0.1487, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 15:02:32] (step=0964200) Train Loss: 0.1487, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 15:02:41] (step=0964300) Train Loss: 0.1490, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 15:02:49] (step=0964400) Train Loss: 0.1504, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 15:02:58] (step=0964500) Train Loss: 0.1483, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 15:03:08] (step=0964600) Train Loss: 0.1508, Train Steps/Sec: 10.03, Grad Norm: 0.0060 +[2025-02-20 15:03:17] (step=0964700) Train Loss: 0.1499, Train Steps/Sec: 11.26, Grad Norm: 0.0060 +[2025-02-20 15:03:25] (step=0964800) Train Loss: 0.1498, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 15:03:34] (step=0964900) Train Loss: 0.1488, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 15:03:43] (step=0965000) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 15:03:51] (step=0965100) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 15:04:00] (step=0965200) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 15:04:08] (step=0965300) Train Loss: 0.1487, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:04:17] (step=0965400) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 15:04:26] (step=0965500) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0065 +[2025-02-20 15:04:34] (step=0965600) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:04:43] (step=0965700) Train Loss: 0.1509, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 15:04:53] (step=0965800) Train Loss: 0.1496, Train Steps/Sec: 10.03, Grad Norm: 0.0061 +[2025-02-20 15:05:03] (step=0965900) Train Loss: 0.1507, Train Steps/Sec: 9.95, Grad Norm: 0.0061 +[2025-02-20 15:05:12] (step=0966000) Train Loss: 0.1498, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 15:05:20] (step=0966100) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 15:05:29] (step=0966200) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 15:05:38] (step=0966300) Train Loss: 0.1491, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 15:05:47] (step=0966400) Train Loss: 0.1507, Train Steps/Sec: 11.21, Grad Norm: 0.0064 +[2025-02-20 15:05:55] (step=0966500) Train Loss: 0.1502, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:06:04] (step=0966600) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 15:06:12] (step=0966700) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 15:06:21] (step=0966800) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:06:30] (step=0966900) Train Loss: 0.1489, Train Steps/Sec: 11.26, Grad Norm: 0.0064 +[2025-02-20 15:06:38] (step=0967000) Train Loss: 0.1496, Train Steps/Sec: 11.57, Grad Norm: 0.0058 +[2025-02-20 15:06:48] (step=0967100) Train Loss: 0.1491, Train Steps/Sec: 9.98, Grad Norm: 0.0060 +[2025-02-20 15:06:57] (step=0967200) Train Loss: 0.1486, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 15:07:06] (step=0967300) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 15:07:14] (step=0967400) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 15:07:23] (step=0967500) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 15:07:31] (step=0967600) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 15:07:40] (step=0967700) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 15:07:49] (step=0967800) Train Loss: 0.1501, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 15:07:57] (step=0967900) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 15:08:06] (step=0968000) Train Loss: 0.1499, Train Steps/Sec: 11.29, Grad Norm: 0.0063 +[2025-02-20 15:08:16] (step=0968100) Train Loss: 0.1500, Train Steps/Sec: 10.28, Grad Norm: 0.0059 +[2025-02-20 15:08:25] (step=0968200) Train Loss: 0.1496, Train Steps/Sec: 11.30, Grad Norm: 0.0060 +[2025-02-20 15:08:35] (step=0968300) Train Loss: 0.1491, Train Steps/Sec: 9.93, Grad Norm: 0.0058 +[2025-02-20 15:08:43] (step=0968400) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:08:52] (step=0968500) Train Loss: 0.1487, Train Steps/Sec: 11.22, Grad Norm: 0.0062 +[2025-02-20 15:09:01] (step=0968600) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 15:09:10] (step=0968700) Train Loss: 0.1508, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 15:09:18] (step=0968800) Train Loss: 0.1492, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 15:09:27] (step=0968900) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:09:35] (step=0969000) Train Loss: 0.1506, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:09:44] (step=0969100) Train Loss: 0.1505, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-20 15:09:53] (step=0969200) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 15:10:01] (step=0969300) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 15:10:10] (step=0969400) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 15:10:19] (step=0969500) Train Loss: 0.1500, Train Steps/Sec: 11.58, Grad Norm: 0.0062 +[2025-02-20 15:10:29] (step=0969600) Train Loss: 0.1501, Train Steps/Sec: 10.04, Grad Norm: 0.0058 +[2025-02-20 15:10:37] (step=0969700) Train Loss: 0.1488, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 15:10:46] (step=0969800) Train Loss: 0.1488, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 15:10:54] (step=0969900) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 15:11:03] (step=0970000) Train Loss: 0.1484, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 15:11:12] (step=0970100) Train Loss: 0.1506, Train Steps/Sec: 11.60, Grad Norm: 0.0058 +[2025-02-20 15:11:21] (step=0970200) Train Loss: 0.1520, Train Steps/Sec: 10.56, Grad Norm: 0.0060 +[2025-02-20 15:11:30] (step=0970300) Train Loss: 0.1495, Train Steps/Sec: 11.22, Grad Norm: 0.0062 +[2025-02-20 15:11:39] (step=0970400) Train Loss: 0.1502, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 15:11:48] (step=0970500) Train Loss: 0.1489, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 15:11:56] (step=0970600) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 15:12:05] (step=0970700) Train Loss: 0.1497, Train Steps/Sec: 11.21, Grad Norm: 0.0061 +[2025-02-20 15:12:15] (step=0970800) Train Loss: 0.1503, Train Steps/Sec: 9.97, Grad Norm: 0.0060 +[2025-02-20 15:12:24] (step=0970900) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 15:12:32] (step=0971000) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 15:12:41] (step=0971100) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:12:50] (step=0971200) Train Loss: 0.1504, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 15:12:58] (step=0971300) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:13:07] (step=0971400) Train Loss: 0.1481, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 15:13:16] (step=0971500) Train Loss: 0.1482, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:13:24] (step=0971600) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:13:33] (step=0971700) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:13:41] (step=0971800) Train Loss: 0.1487, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 15:13:50] (step=0971900) Train Loss: 0.1499, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 15:13:59] (step=0972000) Train Loss: 0.1504, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 15:14:09] (step=0972100) Train Loss: 0.1498, Train Steps/Sec: 10.02, Grad Norm: 0.0058 +[2025-02-20 15:14:17] (step=0972200) Train Loss: 0.1487, Train Steps/Sec: 11.62, Grad Norm: 0.0057 +[2025-02-20 15:14:26] (step=0972300) Train Loss: 0.1518, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:14:35] (step=0972400) Train Loss: 0.1479, Train Steps/Sec: 10.58, Grad Norm: 0.0059 +[2025-02-20 15:14:44] (step=0972500) Train Loss: 0.1492, Train Steps/Sec: 10.92, Grad Norm: 0.0060 +[2025-02-20 15:14:53] (step=0972600) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 15:15:02] (step=0972700) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:15:10] (step=0972800) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:15:19] (step=0972900) Train Loss: 0.1491, Train Steps/Sec: 11.24, Grad Norm: 0.0057 +[2025-02-20 15:15:28] (step=0973000) Train Loss: 0.1489, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 15:15:36] (step=0973100) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:15:45] (step=0973200) Train Loss: 0.1509, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 15:15:55] (step=0973300) Train Loss: 0.1482, Train Steps/Sec: 10.04, Grad Norm: 0.0061 +[2025-02-20 15:16:04] (step=0973400) Train Loss: 0.1491, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-20 15:16:12] (step=0973500) Train Loss: 0.1497, Train Steps/Sec: 11.61, Grad Norm: 0.0066 +[2025-02-20 15:16:21] (step=0973600) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:16:30] (step=0973700) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 15:16:38] (step=0973800) Train Loss: 0.1491, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 15:16:47] (step=0973900) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 15:16:55] (step=0974000) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 15:17:04] (step=0974100) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 15:17:13] (step=0974200) Train Loss: 0.1498, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 15:17:21] (step=0974300) Train Loss: 0.1504, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 15:17:30] (step=0974400) Train Loss: 0.1486, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 15:17:39] (step=0974500) Train Loss: 0.1509, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 15:17:50] (step=0974600) Train Loss: 0.1502, Train Steps/Sec: 9.04, Grad Norm: 0.0060 +[2025-02-20 15:17:59] (step=0974700) Train Loss: 0.1504, Train Steps/Sec: 11.29, Grad Norm: 0.0069 +[2025-02-20 15:18:07] (step=0974800) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0068 +[2025-02-20 15:18:16] (step=0974900) Train Loss: 0.1512, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 15:18:25] (step=0975000) Train Loss: 0.1500, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-20 15:18:33] (step=0975100) Train Loss: 0.1490, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 15:18:42] (step=0975200) Train Loss: 0.1482, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 15:18:50] (step=0975300) Train Loss: 0.1491, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 15:18:59] (step=0975400) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 15:19:08] (step=0975500) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0064 +[2025-02-20 15:19:16] (step=0975600) Train Loss: 0.1480, Train Steps/Sec: 11.29, Grad Norm: 0.0059 +[2025-02-20 15:19:25] (step=0975700) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 15:19:35] (step=0975800) Train Loss: 0.1487, Train Steps/Sec: 9.89, Grad Norm: 0.0061 +[2025-02-20 15:19:44] (step=0975900) Train Loss: 0.1509, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 15:19:52] (step=0976000) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 15:20:01] (step=0976100) Train Loss: 0.1506, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 15:20:09] (step=0976200) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 15:20:18] (step=0976300) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:20:27] (step=0976400) Train Loss: 0.1487, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 15:20:35] (step=0976500) Train Loss: 0.1485, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:20:44] (step=0976600) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 15:20:53] (step=0976700) Train Loss: 0.1493, Train Steps/Sec: 10.90, Grad Norm: 0.0059 +[2025-02-20 15:21:02] (step=0976800) Train Loss: 0.1509, Train Steps/Sec: 10.87, Grad Norm: 0.0061 +[2025-02-20 15:21:11] (step=0976900) Train Loss: 0.1495, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-20 15:21:20] (step=0977000) Train Loss: 0.1502, Train Steps/Sec: 11.56, Grad Norm: 0.0064 +[2025-02-20 15:21:30] (step=0977100) Train Loss: 0.1505, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-20 15:21:39] (step=0977200) Train Loss: 0.1501, Train Steps/Sec: 11.23, Grad Norm: 0.0059 +[2025-02-20 15:21:47] (step=0977300) Train Loss: 0.1480, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 15:21:56] (step=0977400) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 15:22:05] (step=0977500) Train Loss: 0.1499, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 15:22:13] (step=0977600) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 15:22:22] (step=0977700) Train Loss: 0.1499, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-20 15:22:31] (step=0977800) Train Loss: 0.1495, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 15:22:39] (step=0977900) Train Loss: 0.1509, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 15:22:48] (step=0978000) Train Loss: 0.1487, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 15:22:56] (step=0978100) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 15:23:05] (step=0978200) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:23:15] (step=0978300) Train Loss: 0.1484, Train Steps/Sec: 10.06, Grad Norm: 0.0061 +[2025-02-20 15:23:24] (step=0978400) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 15:23:32] (step=0978500) Train Loss: 0.1492, Train Steps/Sec: 11.64, Grad Norm: 0.0063 +[2025-02-20 15:23:41] (step=0978600) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 15:23:49] (step=0978700) Train Loss: 0.1499, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 15:23:58] (step=0978800) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:24:07] (step=0978900) Train Loss: 0.1489, Train Steps/Sec: 10.58, Grad Norm: 0.0062 +[2025-02-20 15:24:16] (step=0979000) Train Loss: 0.1494, Train Steps/Sec: 11.24, Grad Norm: 0.0059 +[2025-02-20 15:24:25] (step=0979100) Train Loss: 0.1503, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 15:24:34] (step=0979200) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 15:24:42] (step=0979300) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 15:24:51] (step=0979400) Train Loss: 0.1489, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 15:25:00] (step=0979500) Train Loss: 0.1489, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 15:25:10] (step=0979600) Train Loss: 0.1493, Train Steps/Sec: 10.01, Grad Norm: 0.0060 +[2025-02-20 15:25:19] (step=0979700) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 15:25:27] (step=0979800) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:25:36] (step=0979900) Train Loss: 0.1495, Train Steps/Sec: 11.27, Grad Norm: 0.0064 +[2025-02-20 15:25:45] (step=0980000) Train Loss: 0.1507, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:25:46] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/0980000.pt +[2025-02-20 15:25:55] (step=0980100) Train Loss: 0.1496, Train Steps/Sec: 10.10, Grad Norm: 0.0059 +[2025-02-20 15:26:03] (step=0980200) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 15:26:12] (step=0980300) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 15:26:20] (step=0980400) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0064 +[2025-02-20 15:26:29] (step=0980500) Train Loss: 0.1493, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 15:26:38] (step=0980600) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 15:26:46] (step=0980700) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0065 +[2025-02-20 15:26:56] (step=0980800) Train Loss: 0.1496, Train Steps/Sec: 10.04, Grad Norm: 0.0064 +[2025-02-20 15:27:05] (step=0980900) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:27:13] (step=0981000) Train Loss: 0.1504, Train Steps/Sec: 11.62, Grad Norm: 0.0057 +[2025-02-20 15:27:23] (step=0981100) Train Loss: 0.1493, Train Steps/Sec: 10.24, Grad Norm: 0.0059 +[2025-02-20 15:27:32] (step=0981200) Train Loss: 0.1484, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-20 15:27:41] (step=0981300) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 15:27:49] (step=0981400) Train Loss: 0.1490, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 15:27:58] (step=0981500) Train Loss: 0.1495, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-20 15:28:07] (step=0981600) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:28:15] (step=0981700) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 15:28:24] (step=0981800) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 15:28:33] (step=0981900) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 15:28:41] (step=0982000) Train Loss: 0.1512, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 15:28:51] (step=0982100) Train Loss: 0.1492, Train Steps/Sec: 9.78, Grad Norm: 0.0064 +[2025-02-20 15:29:00] (step=0982200) Train Loss: 0.1498, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 15:29:09] (step=0982300) Train Loss: 0.1505, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:29:17] (step=0982400) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 15:29:26] (step=0982500) Train Loss: 0.1489, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 15:29:34] (step=0982600) Train Loss: 0.1500, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 15:29:43] (step=0982700) Train Loss: 0.1476, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:29:52] (step=0982800) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:30:00] (step=0982900) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:30:09] (step=0983000) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 15:30:17] (step=0983100) Train Loss: 0.1513, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:30:26] (step=0983200) Train Loss: 0.1509, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 15:30:37] (step=0983300) Train Loss: 0.1494, Train Steps/Sec: 8.96, Grad Norm: 0.0060 +[2025-02-20 15:30:46] (step=0983400) Train Loss: 0.1486, Train Steps/Sec: 11.24, Grad Norm: 0.0059 +[2025-02-20 15:30:55] (step=0983500) Train Loss: 0.1488, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 15:31:04] (step=0983600) Train Loss: 0.1490, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 15:31:12] (step=0983700) Train Loss: 0.1500, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-20 15:31:21] (step=0983800) Train Loss: 0.1487, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 15:31:30] (step=0983900) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 15:31:38] (step=0984000) Train Loss: 0.1497, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 15:31:47] (step=0984100) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 15:31:56] (step=0984200) Train Loss: 0.1494, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:32:04] (step=0984300) Train Loss: 0.1511, Train Steps/Sec: 11.26, Grad Norm: 0.0058 +[2025-02-20 15:32:13] (step=0984400) Train Loss: 0.1483, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:32:22] (step=0984500) Train Loss: 0.1501, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:32:32] (step=0984600) Train Loss: 0.1495, Train Steps/Sec: 10.04, Grad Norm: 0.0060 +[2025-02-20 15:32:40] (step=0984700) Train Loss: 0.1501, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 15:32:49] (step=0984800) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0064 +[2025-02-20 15:32:57] (step=0984900) Train Loss: 0.1492, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:33:06] (step=0985000) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:33:15] (step=0985100) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0057 +[2025-02-20 15:33:23] (step=0985200) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:33:32] (step=0985300) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 15:33:41] (step=0985400) Train Loss: 0.1503, Train Steps/Sec: 10.92, Grad Norm: 0.0060 +[2025-02-20 15:33:50] (step=0985500) Train Loss: 0.1505, Train Steps/Sec: 10.90, Grad Norm: 0.0063 +[2025-02-20 15:33:59] (step=0985600) Train Loss: 0.1496, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-20 15:34:08] (step=0985700) Train Loss: 0.1514, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:34:18] (step=0985800) Train Loss: 0.1481, Train Steps/Sec: 9.73, Grad Norm: 0.0066 +[2025-02-20 15:34:27] (step=0985900) Train Loss: 0.1507, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 15:34:35] (step=0986000) Train Loss: 0.1500, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-20 15:34:44] (step=0986100) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-20 15:34:52] (step=0986200) Train Loss: 0.1491, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 15:35:01] (step=0986300) Train Loss: 0.1491, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 15:35:10] (step=0986400) Train Loss: 0.1492, Train Steps/Sec: 11.24, Grad Norm: 0.0058 +[2025-02-20 15:35:19] (step=0986500) Train Loss: 0.1492, Train Steps/Sec: 11.59, Grad Norm: 0.0061 +[2025-02-20 15:35:27] (step=0986600) Train Loss: 0.1504, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 15:35:36] (step=0986700) Train Loss: 0.1492, Train Steps/Sec: 11.60, Grad Norm: 0.0062 +[2025-02-20 15:35:44] (step=0986800) Train Loss: 0.1486, Train Steps/Sec: 11.60, Grad Norm: 0.0060 +[2025-02-20 15:35:53] (step=0986900) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 15:36:02] (step=0987000) Train Loss: 0.1511, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 15:36:12] (step=0987100) Train Loss: 0.1491, Train Steps/Sec: 9.91, Grad Norm: 0.0060 +[2025-02-20 15:36:20] (step=0987200) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:36:29] (step=0987300) Train Loss: 0.1502, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 15:36:38] (step=0987400) Train Loss: 0.1511, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 15:36:46] (step=0987500) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 15:36:56] (step=0987600) Train Loss: 0.1499, Train Steps/Sec: 10.59, Grad Norm: 0.0058 +[2025-02-20 15:37:04] (step=0987700) Train Loss: 0.1489, Train Steps/Sec: 11.25, Grad Norm: 0.0057 +[2025-02-20 15:37:13] (step=0987800) Train Loss: 0.1490, Train Steps/Sec: 11.29, Grad Norm: 0.0062 +[2025-02-20 15:37:22] (step=0987900) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 15:37:31] (step=0988000) Train Loss: 0.1508, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-20 15:37:39] (step=0988100) Train Loss: 0.1492, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 15:37:48] (step=0988200) Train Loss: 0.1488, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:37:58] (step=0988300) Train Loss: 0.1509, Train Steps/Sec: 9.98, Grad Norm: 0.0065 +[2025-02-20 15:38:07] (step=0988400) Train Loss: 0.1497, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 15:38:15] (step=0988500) Train Loss: 0.1492, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 15:38:24] (step=0988600) Train Loss: 0.1500, Train Steps/Sec: 11.27, Grad Norm: 0.0063 +[2025-02-20 15:38:33] (step=0988700) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 15:38:41] (step=0988800) Train Loss: 0.1487, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 15:38:50] (step=0988900) Train Loss: 0.1511, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:38:59] (step=0989000) Train Loss: 0.1498, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:39:07] (step=0989100) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:39:16] (step=0989200) Train Loss: 0.1484, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:39:24] (step=0989300) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 15:39:33] (step=0989400) Train Loss: 0.1487, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:39:42] (step=0989500) Train Loss: 0.1493, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 15:39:51] (step=0989600) Train Loss: 0.1496, Train Steps/Sec: 10.06, Grad Norm: 0.0063 +[2025-02-20 15:40:00] (step=0989700) Train Loss: 0.1505, Train Steps/Sec: 11.29, Grad Norm: 0.0060 +[2025-02-20 15:40:10] (step=0989800) Train Loss: 0.1488, Train Steps/Sec: 10.55, Grad Norm: 0.0059 +[2025-02-20 15:40:19] (step=0989900) Train Loss: 0.1505, Train Steps/Sec: 11.29, Grad Norm: 0.0060 +[2025-02-20 15:40:27] (step=0990000) Train Loss: 0.1478, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 15:40:36] (step=0990100) Train Loss: 0.1498, Train Steps/Sec: 11.64, Grad Norm: 0.0058 +[2025-02-20 15:40:45] (step=0990200) Train Loss: 0.1504, Train Steps/Sec: 11.25, Grad Norm: 0.0062 +[2025-02-20 15:40:53] (step=0990300) Train Loss: 0.1506, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-20 15:41:02] (step=0990400) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0059 +[2025-02-20 15:41:11] (step=0990500) Train Loss: 0.1498, Train Steps/Sec: 11.65, Grad Norm: 0.0059 +[2025-02-20 15:41:19] (step=0990600) Train Loss: 0.1500, Train Steps/Sec: 11.62, Grad Norm: 0.0058 +[2025-02-20 15:41:28] (step=0990700) Train Loss: 0.1511, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 15:41:38] (step=0990800) Train Loss: 0.1491, Train Steps/Sec: 9.80, Grad Norm: 0.0065 +[2025-02-20 15:41:46] (step=0990900) Train Loss: 0.1492, Train Steps/Sec: 11.69, Grad Norm: 0.0062 +[2025-02-20 15:41:55] (step=0991000) Train Loss: 0.1496, Train Steps/Sec: 11.64, Grad Norm: 0.0062 +[2025-02-20 15:42:04] (step=0991100) Train Loss: 0.1493, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 15:42:12] (step=0991200) Train Loss: 0.1501, Train Steps/Sec: 11.54, Grad Norm: 0.0060 +[2025-02-20 15:42:21] (step=0991300) Train Loss: 0.1482, Train Steps/Sec: 11.54, Grad Norm: 0.0060 +[2025-02-20 15:42:30] (step=0991400) Train Loss: 0.1510, Train Steps/Sec: 11.55, Grad Norm: 0.0060 +[2025-02-20 15:42:38] (step=0991500) Train Loss: 0.1499, Train Steps/Sec: 11.52, Grad Norm: 0.0062 +[2025-02-20 15:42:47] (step=0991600) Train Loss: 0.1490, Train Steps/Sec: 11.49, Grad Norm: 0.0058 +[2025-02-20 15:42:56] (step=0991700) Train Loss: 0.1499, Train Steps/Sec: 11.48, Grad Norm: 0.0060 +[2025-02-20 15:43:05] (step=0991800) Train Loss: 0.1496, Train Steps/Sec: 11.49, Grad Norm: 0.0063 +[2025-02-20 15:43:13] (step=0991900) Train Loss: 0.1489, Train Steps/Sec: 11.16, Grad Norm: 0.0060 +[2025-02-20 15:43:23] (step=0992000) Train Loss: 0.1499, Train Steps/Sec: 10.11, Grad Norm: 0.0060 +[2025-02-20 15:43:34] (step=0992100) Train Loss: 0.1494, Train Steps/Sec: 9.49, Grad Norm: 0.0061 +[2025-02-20 15:43:43] (step=0992200) Train Loss: 0.1504, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-20 15:43:51] (step=0992300) Train Loss: 0.1495, Train Steps/Sec: 11.19, Grad Norm: 0.0062 +[2025-02-20 15:44:00] (step=0992400) Train Loss: 0.1501, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 15:44:09] (step=0992500) Train Loss: 0.1487, Train Steps/Sec: 11.57, Grad Norm: 0.0063 +[2025-02-20 15:44:17] (step=0992600) Train Loss: 0.1503, Train Steps/Sec: 11.56, Grad Norm: 0.0061 +[2025-02-20 15:44:26] (step=0992700) Train Loss: 0.1494, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 15:44:35] (step=0992800) Train Loss: 0.1492, Train Steps/Sec: 11.56, Grad Norm: 0.0059 +[2025-02-20 15:44:43] (step=0992900) Train Loss: 0.1487, Train Steps/Sec: 11.56, Grad Norm: 0.0063 +[2025-02-20 15:44:52] (step=0993000) Train Loss: 0.1502, Train Steps/Sec: 11.22, Grad Norm: 0.0061 +[2025-02-20 15:45:01] (step=0993100) Train Loss: 0.1484, Train Steps/Sec: 11.56, Grad Norm: 0.0060 +[2025-02-20 15:45:10] (step=0993200) Train Loss: 0.1488, Train Steps/Sec: 11.57, Grad Norm: 0.0061 +[2025-02-20 15:45:20] (step=0993300) Train Loss: 0.1500, Train Steps/Sec: 9.94, Grad Norm: 0.0060 +[2025-02-20 15:45:28] (step=0993400) Train Loss: 0.1506, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:45:37] (step=0993500) Train Loss: 0.1489, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 15:45:45] (step=0993600) Train Loss: 0.1503, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 15:45:54] (step=0993700) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 15:46:03] (step=0993800) Train Loss: 0.1504, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 15:46:11] (step=0993900) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:46:20] (step=0994000) Train Loss: 0.1490, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 15:46:29] (step=0994100) Train Loss: 0.1510, Train Steps/Sec: 10.94, Grad Norm: 0.0062 +[2025-02-20 15:46:38] (step=0994200) Train Loss: 0.1508, Train Steps/Sec: 10.59, Grad Norm: 0.0062 +[2025-02-20 15:46:47] (step=0994300) Train Loss: 0.1482, Train Steps/Sec: 11.27, Grad Norm: 0.0058 +[2025-02-20 15:46:56] (step=0994400) Train Loss: 0.1494, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-20 15:47:05] (step=0994500) Train Loss: 0.1491, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-20 15:47:15] (step=0994600) Train Loss: 0.1499, Train Steps/Sec: 10.04, Grad Norm: 0.0060 +[2025-02-20 15:47:23] (step=0994700) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 15:47:32] (step=0994800) Train Loss: 0.1492, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:47:41] (step=0994900) Train Loss: 0.1503, Train Steps/Sec: 11.61, Grad Norm: 0.0059 +[2025-02-20 15:47:49] (step=0995000) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 15:47:58] (step=0995100) Train Loss: 0.1502, Train Steps/Sec: 11.26, Grad Norm: 0.0062 +[2025-02-20 15:48:07] (step=0995200) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0063 +[2025-02-20 15:48:15] (step=0995300) Train Loss: 0.1505, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:48:24] (step=0995400) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0063 +[2025-02-20 15:48:32] (step=0995500) Train Loss: 0.1489, Train Steps/Sec: 11.62, Grad Norm: 0.0059 +[2025-02-20 15:48:41] (step=0995600) Train Loss: 0.1482, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 15:48:50] (step=0995700) Train Loss: 0.1490, Train Steps/Sec: 11.64, Grad Norm: 0.0057 +[2025-02-20 15:49:00] (step=0995800) Train Loss: 0.1500, Train Steps/Sec: 10.06, Grad Norm: 0.0059 +[2025-02-20 15:49:08] (step=0995900) Train Loss: 0.1496, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:49:17] (step=0996000) Train Loss: 0.1493, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:49:25] (step=0996100) Train Loss: 0.1495, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:49:34] (step=0996200) Train Loss: 0.1503, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:49:44] (step=0996300) Train Loss: 0.1503, Train Steps/Sec: 10.24, Grad Norm: 0.0061 +[2025-02-20 15:49:53] (step=0996400) Train Loss: 0.1495, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-20 15:50:02] (step=0996500) Train Loss: 0.1501, Train Steps/Sec: 11.60, Grad Norm: 0.0061 +[2025-02-20 15:50:10] (step=0996600) Train Loss: 0.1486, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 15:50:19] (step=0996700) Train Loss: 0.1512, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-20 15:50:28] (step=0996800) Train Loss: 0.1500, Train Steps/Sec: 11.61, Grad Norm: 0.0062 +[2025-02-20 15:50:36] (step=0996900) Train Loss: 0.1502, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 15:50:45] (step=0997000) Train Loss: 0.1514, Train Steps/Sec: 11.62, Grad Norm: 0.0062 +[2025-02-20 15:50:55] (step=0997100) Train Loss: 0.1500, Train Steps/Sec: 9.99, Grad Norm: 0.0058 +[2025-02-20 15:51:04] (step=0997200) Train Loss: 0.1513, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 15:51:12] (step=0997300) Train Loss: 0.1496, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 15:51:21] (step=0997400) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 15:51:30] (step=0997500) Train Loss: 0.1495, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 15:51:38] (step=0997600) Train Loss: 0.1499, Train Steps/Sec: 11.61, Grad Norm: 0.0060 +[2025-02-20 15:51:47] (step=0997700) Train Loss: 0.1506, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 15:51:55] (step=0997800) Train Loss: 0.1494, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 15:52:04] (step=0997900) Train Loss: 0.1488, Train Steps/Sec: 11.61, Grad Norm: 0.0061 +[2025-02-20 15:52:13] (step=0998000) Train Loss: 0.1496, Train Steps/Sec: 11.61, Grad Norm: 0.0058 +[2025-02-20 15:52:21] (step=0998100) Train Loss: 0.1500, Train Steps/Sec: 11.60, Grad Norm: 0.0059 +[2025-02-20 15:52:30] (step=0998200) Train Loss: 0.1499, Train Steps/Sec: 11.59, Grad Norm: 0.0064 +[2025-02-20 15:52:40] (step=0998300) Train Loss: 0.1504, Train Steps/Sec: 10.00, Grad Norm: 0.0060 +[2025-02-20 15:52:49] (step=0998400) Train Loss: 0.1490, Train Steps/Sec: 11.28, Grad Norm: 0.0060 +[2025-02-20 15:52:59] (step=0998500) Train Loss: 0.1494, Train Steps/Sec: 10.27, Grad Norm: 0.0058 +[2025-02-20 15:53:07] (step=0998600) Train Loss: 0.1500, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-20 15:53:16] (step=0998700) Train Loss: 0.1502, Train Steps/Sec: 11.63, Grad Norm: 0.0062 +[2025-02-20 15:53:25] (step=0998800) Train Loss: 0.1507, Train Steps/Sec: 11.25, Grad Norm: 0.0062 +[2025-02-20 15:53:34] (step=0998900) Train Loss: 0.1501, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-20 15:53:42] (step=0999000) Train Loss: 0.1513, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 15:53:51] (step=0999100) Train Loss: 0.1494, Train Steps/Sec: 11.62, Grad Norm: 0.0061 +[2025-02-20 15:53:59] (step=0999200) Train Loss: 0.1499, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:54:08] (step=0999300) Train Loss: 0.1496, Train Steps/Sec: 11.63, Grad Norm: 0.0058 +[2025-02-20 15:54:17] (step=0999400) Train Loss: 0.1508, Train Steps/Sec: 11.63, Grad Norm: 0.0060 +[2025-02-20 15:54:25] (step=0999500) Train Loss: 0.1490, Train Steps/Sec: 11.29, Grad Norm: 0.0062 +[2025-02-20 15:54:35] (step=0999600) Train Loss: 0.1480, Train Steps/Sec: 10.02, Grad Norm: 0.0060 +[2025-02-20 15:54:44] (step=0999700) Train Loss: 0.1497, Train Steps/Sec: 11.63, Grad Norm: 0.0061 +[2025-02-20 15:54:53] (step=0999800) Train Loss: 0.1501, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-20 15:55:01] (step=0999900) Train Loss: 0.1493, Train Steps/Sec: 11.64, Grad Norm: 0.0061 +[2025-02-20 15:55:10] (step=1000000) Train Loss: 0.1486, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-20 15:55:10] Saved checkpoint to ../logs/smo/diff_s_1000kx1024_qknorm/checkpoints/1000000.pt +[2025-02-20 16:10:31] Calculating FID with 50000 number of samples +[2025-02-20 16:11:14] (step=1000000), Fid=44.57126887478353 +[2025-02-20 16:11:15] Done!