diff --git "a/diff_b_1000kx1024_qknorm/log.txt" "b/diff_b_1000kx1024_qknorm/log.txt" new file mode 100644--- /dev/null +++ "b/diff_b_1000kx1024_qknorm/log.txt" @@ -0,0 +1,10291 @@ +[2025-02-20 22:28:18] Model: DistributedDataParallel( + (module): DiT( + (x_embedder): PatchEmbed( + (proj): Conv2d(4, 768, kernel_size=(2, 2), stride=(2, 2)) + (norm): Identity() + ) + (t_embedder): TimestepEmbedder( + (mlp): Sequential( + (0): Linear(in_features=256, out_features=768, bias=True) + (1): SiLU() + (2): Linear(in_features=768, out_features=768, bias=True) + ) + ) + (y_embedder): LabelEmbedder( + (embedding_table): Embedding(1001, 768) + ) + (blocks): ModuleList( + (0-11): 12 x DiTBlock( + (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=False) + (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=False) + (attn): Attention( + (qkv): Linear(in_features=768, out_features=2304, bias=True) + (q_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True) + (k_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=768, out_features=768, bias=True) + (proj_drop): Dropout(p=0.0, inplace=False) + ) + (mlp): Mlp( + (fc1): Linear(in_features=768, out_features=3072, bias=True) + (act): GELU(approximate='tanh') + (drop1): Dropout(p=0, inplace=False) + (norm): Identity() + (fc2): Linear(in_features=3072, out_features=768, bias=True) + (drop2): Dropout(p=0, inplace=False) + ) + (adaLN_modulation): Sequential( + (0): SiLU() + (1): Linear(in_features=768, out_features=4608, bias=True) + ) + ) + ) + (final_layer): FinalLayer( + (norm_final): LayerNorm((768,), eps=1e-06, elementwise_affine=False) + (linear): Linear(in_features=768, out_features=32, bias=True) + (adaLN_modulation): Sequential( + (0): SiLU() + (1): Linear(in_features=768, out_features=1536, bias=True) + ) + ) + ) +) +[2025-02-20 22:28:18] DiT Parameters: 130.52M +[2025-02-20 22:28:18] DiT Trainable Parameters: 130.32M +[2025-02-20 22:28:18] Optimizer: AdamW, lr=0.0002, beta2=0.999 +[2025-02-20 22:28:18] module.pos_embed.requires_grad : False +[2025-02-20 22:28:18] module.x_embedder.proj.weight.requires_grad : True +[2025-02-20 22:28:18] module.x_embedder.proj.bias.requires_grad : True +[2025-02-20 22:28:18] module.t_embedder.mlp.0.weight.requires_grad : True +[2025-02-20 22:28:18] module.t_embedder.mlp.0.bias.requires_grad : True +[2025-02-20 22:28:18] module.t_embedder.mlp.2.weight.requires_grad : True +[2025-02-20 22:28:18] module.t_embedder.mlp.2.bias.requires_grad : True +[2025-02-20 22:28:18] module.y_embedder.embedding_table.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.0.attn.qkv.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.0.attn.qkv.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.0.attn.q_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.0.attn.q_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.0.attn.k_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.0.attn.k_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.0.attn.proj.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.0.attn.proj.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.0.mlp.fc1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.0.mlp.fc1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.0.mlp.fc2.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.0.mlp.fc2.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.0.adaLN_modulation.1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.0.adaLN_modulation.1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.1.attn.qkv.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.1.attn.qkv.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.1.attn.q_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.1.attn.q_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.1.attn.k_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.1.attn.k_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.1.attn.proj.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.1.attn.proj.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.1.mlp.fc1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.1.mlp.fc1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.1.mlp.fc2.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.1.mlp.fc2.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.1.adaLN_modulation.1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.1.adaLN_modulation.1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.2.attn.qkv.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.2.attn.qkv.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.2.attn.q_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.2.attn.q_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.2.attn.k_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.2.attn.k_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.2.attn.proj.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.2.attn.proj.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.2.mlp.fc1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.2.mlp.fc1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.2.mlp.fc2.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.2.mlp.fc2.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.2.adaLN_modulation.1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.2.adaLN_modulation.1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.3.attn.qkv.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.3.attn.qkv.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.3.attn.q_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.3.attn.q_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.3.attn.k_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.3.attn.k_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.3.attn.proj.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.3.attn.proj.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.3.mlp.fc1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.3.mlp.fc1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.3.mlp.fc2.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.3.mlp.fc2.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.3.adaLN_modulation.1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.3.adaLN_modulation.1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.4.attn.qkv.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.4.attn.qkv.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.4.attn.q_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.4.attn.q_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.4.attn.k_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.4.attn.k_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.4.attn.proj.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.4.attn.proj.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.4.mlp.fc1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.4.mlp.fc1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.4.mlp.fc2.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.4.mlp.fc2.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.4.adaLN_modulation.1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.4.adaLN_modulation.1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.5.attn.qkv.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.5.attn.qkv.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.5.attn.q_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.5.attn.q_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.5.attn.k_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.5.attn.k_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.5.attn.proj.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.5.attn.proj.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.5.mlp.fc1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.5.mlp.fc1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.5.mlp.fc2.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.5.mlp.fc2.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.5.adaLN_modulation.1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.5.adaLN_modulation.1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.6.attn.qkv.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.6.attn.qkv.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.6.attn.q_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.6.attn.q_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.6.attn.k_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.6.attn.k_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.6.attn.proj.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.6.attn.proj.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.6.mlp.fc1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.6.mlp.fc1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.6.mlp.fc2.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.6.mlp.fc2.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.6.adaLN_modulation.1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.6.adaLN_modulation.1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.7.attn.qkv.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.7.attn.qkv.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.7.attn.q_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.7.attn.q_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.7.attn.k_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.7.attn.k_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.7.attn.proj.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.7.attn.proj.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.7.mlp.fc1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.7.mlp.fc1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.7.mlp.fc2.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.7.mlp.fc2.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.7.adaLN_modulation.1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.7.adaLN_modulation.1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.8.attn.qkv.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.8.attn.qkv.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.8.attn.q_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.8.attn.q_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.8.attn.k_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.8.attn.k_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.8.attn.proj.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.8.attn.proj.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.8.mlp.fc1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.8.mlp.fc1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.8.mlp.fc2.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.8.mlp.fc2.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.8.adaLN_modulation.1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.8.adaLN_modulation.1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.9.attn.qkv.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.9.attn.qkv.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.9.attn.q_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.9.attn.q_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.9.attn.k_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.9.attn.k_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.9.attn.proj.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.9.attn.proj.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.9.mlp.fc1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.9.mlp.fc1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.9.mlp.fc2.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.9.mlp.fc2.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.9.adaLN_modulation.1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.9.adaLN_modulation.1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.10.attn.qkv.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.10.attn.qkv.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.10.attn.q_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.10.attn.q_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.10.attn.k_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.10.attn.k_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.10.attn.proj.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.10.attn.proj.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.10.mlp.fc1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.10.mlp.fc1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.10.mlp.fc2.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.10.mlp.fc2.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.10.adaLN_modulation.1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.10.adaLN_modulation.1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.11.attn.qkv.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.11.attn.qkv.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.11.attn.q_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.11.attn.q_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.11.attn.k_norm.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.11.attn.k_norm.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.11.attn.proj.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.11.attn.proj.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.11.mlp.fc1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.11.mlp.fc1.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.11.mlp.fc2.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.11.mlp.fc2.bias.requires_grad : True +[2025-02-20 22:28:18] module.blocks.11.adaLN_modulation.1.weight.requires_grad : True +[2025-02-20 22:28:18] module.blocks.11.adaLN_modulation.1.bias.requires_grad : True +[2025-02-20 22:28:18] module.final_layer.linear.weight.requires_grad : True +[2025-02-20 22:28:18] module.final_layer.linear.bias.requires_grad : True +[2025-02-20 22:28:18] module.final_layer.adaLN_modulation.1.weight.requires_grad : True +[2025-02-20 22:28:18] module.final_layer.adaLN_modulation.1.bias.requires_grad : True +[2025-02-20 22:28:19] Dataset contains 1,281,168 images /data/checkpoints/LanguageBind/offline_feature/offline_vae_256/imagenet_train_256 +[2025-02-20 22:28:19] Batch size 32 per gpu, with 1024 global batch size +[2025-02-20 22:28:19] Train config: {'ckpt_path': '/data/logs/smo/diff_b_1000kx1024_qknorm/checkpoints/1000000.pt', 'data': {'data_path': '/data/checkpoints/LanguageBind/offline_feature/offline_vae_256/imagenet_train_256', 'fid_reference_file': '/data/checkpoints/VIRTUAL_imagenet256_labeled.npz', 'image_size': 256, 'num_classes': 1000, 'num_workers': 16, 'latent_norm': False, 'latent_multiplier': 0.18215}, 'vae': {'model_path': '/data/checkpoints/stabilityai/sd-vae-ft-ema', 'downsample_ratio': 8}, 'model': {'model_type': 'DiT-B/2', 'use_qknorm': True, 'use_swiglu': False, 'use_rope': False, 'use_rmsnorm': False, 'in_chans': 4, 'use_checkpoint': False}, 'train': {'max_steps': 1000000, 'global_batch_size': 1024, 'global_seed': 0, 'output_dir': '../logs/smo/diff_b_1000kx1024_qknorm', 'ckpt': None, 'log_every': 100, 'ckpt_every': 20000, 'wandb': True, 'seed': 1234, 'precision': 'bf16', 'resume': False}, 'optimizer': {'lr': 0.0002, 'beta2': 0.999}, 'wandb': {'proj_name': 'smo', 'log_name': 'diff_b_1000kx1024_qknorm', 'key': '953e958793b218efb850fa194e85843e2c3bd88b'}, 'scheduler': {'diffusion': True, 'transport': False}, 'diffusion': {'learn_sigma': True, 'diffusion_steps': 1000}, 'transport': {'path_type': 'Linear', 'prediction': 'velocity', 'loss_weight': None, 'sample_eps': None, 'train_eps': None, 'use_cosine_loss': True, 'use_lognorm': True}, 'sample': {'num_sampling_steps': 250, 'cfg_scale': 1.5, 'per_proc_batch_size': 64, 'fid_num': 50000, 'cfg_interval_start': 0.0}} +[2025-02-20 22:28:54] (step=0000100) Train Loss: 0.3101, Train Steps/Sec: 2.81, Grad Norm: 0.5879 +[2025-02-20 22:29:03] (step=0000200) Train Loss: 0.1976, Train Steps/Sec: 11.79, Grad Norm: 0.2441 +[2025-02-20 22:29:12] (step=0000300) Train Loss: 0.1882, Train Steps/Sec: 10.79, Grad Norm: 0.2604 +[2025-02-20 22:29:23] (step=0000400) Train Loss: 0.1867, Train Steps/Sec: 9.00, Grad Norm: 0.2583 +[2025-02-20 22:29:31] (step=0000500) Train Loss: 0.1818, Train Steps/Sec: 12.11, Grad Norm: 0.2545 +[2025-02-20 22:29:39] (step=0000600) Train Loss: 0.1809, Train Steps/Sec: 12.53, Grad Norm: 0.2507 +[2025-02-20 22:29:48] (step=0000700) Train Loss: 0.1774, Train Steps/Sec: 11.49, Grad Norm: 0.2202 +[2025-02-20 22:29:56] (step=0000800) Train Loss: 0.1781, Train Steps/Sec: 12.52, Grad Norm: 0.2145 +[2025-02-20 22:30:04] (step=0000900) Train Loss: 0.1769, Train Steps/Sec: 12.52, Grad Norm: 0.1930 +[2025-02-20 22:30:12] (step=0001000) Train Loss: 0.1756, Train Steps/Sec: 12.51, Grad Norm: 0.1997 +[2025-02-20 22:30:20] (step=0001100) Train Loss: 0.1730, Train Steps/Sec: 12.52, Grad Norm: 0.1838 +[2025-02-20 22:30:28] (step=0001200) Train Loss: 0.1731, Train Steps/Sec: 12.53, Grad Norm: 0.1913 +[2025-02-20 22:30:37] (step=0001300) Train Loss: 0.1736, Train Steps/Sec: 11.04, Grad Norm: 0.1670 +[2025-02-20 22:30:45] (step=0001400) Train Loss: 0.1715, Train Steps/Sec: 12.56, Grad Norm: 0.1705 +[2025-02-20 22:30:53] (step=0001500) Train Loss: 0.1714, Train Steps/Sec: 12.60, Grad Norm: 0.1539 +[2025-02-20 22:31:01] (step=0001600) Train Loss: 0.1695, Train Steps/Sec: 12.59, Grad Norm: 0.1564 +[2025-02-20 22:31:09] (step=0001700) Train Loss: 0.1686, Train Steps/Sec: 12.60, Grad Norm: 0.1617 +[2025-02-20 22:31:17] (step=0001800) Train Loss: 0.1697, Train Steps/Sec: 12.61, Grad Norm: 0.1487 +[2025-02-20 22:31:25] (step=0001900) Train Loss: 0.1698, Train Steps/Sec: 12.60, Grad Norm: 0.1410 +[2025-02-20 22:31:33] (step=0002000) Train Loss: 0.1675, Train Steps/Sec: 12.58, Grad Norm: 0.1417 +[2025-02-20 22:31:41] (step=0002100) Train Loss: 0.1676, Train Steps/Sec: 12.60, Grad Norm: 0.1355 +[2025-02-20 22:31:49] (step=0002200) Train Loss: 0.1670, Train Steps/Sec: 12.59, Grad Norm: 0.1348 +[2025-02-20 22:31:56] (step=0002300) Train Loss: 0.1677, Train Steps/Sec: 12.58, Grad Norm: 0.1330 +[2025-02-20 22:32:05] (step=0002400) Train Loss: 0.1664, Train Steps/Sec: 12.10, Grad Norm: 0.1257 +[2025-02-20 22:32:14] (step=0002500) Train Loss: 0.1678, Train Steps/Sec: 11.18, Grad Norm: 0.1254 +[2025-02-20 22:32:25] (step=0002600) Train Loss: 0.1659, Train Steps/Sec: 8.73, Grad Norm: 0.1213 +[2025-02-20 22:32:33] (step=0002700) Train Loss: 0.1665, Train Steps/Sec: 12.58, Grad Norm: 0.1170 +[2025-02-20 22:32:41] (step=0002800) Train Loss: 0.1673, Train Steps/Sec: 12.61, Grad Norm: 0.1232 +[2025-02-20 22:32:50] (step=0002900) Train Loss: 0.1657, Train Steps/Sec: 11.74, Grad Norm: 0.1130 +[2025-02-20 22:32:58] (step=0003000) Train Loss: 0.1677, Train Steps/Sec: 12.62, Grad Norm: 0.1238 +[2025-02-20 22:33:05] (step=0003100) Train Loss: 0.1640, Train Steps/Sec: 12.62, Grad Norm: 0.1108 +[2025-02-20 22:33:13] (step=0003200) Train Loss: 0.1671, Train Steps/Sec: 12.62, Grad Norm: 0.1096 +[2025-02-20 22:33:21] (step=0003300) Train Loss: 0.1653, Train Steps/Sec: 12.62, Grad Norm: 0.1036 +[2025-02-20 22:33:29] (step=0003400) Train Loss: 0.1659, Train Steps/Sec: 12.62, Grad Norm: 0.1074 +[2025-02-20 22:33:37] (step=0003500) Train Loss: 0.1652, Train Steps/Sec: 12.62, Grad Norm: 0.1033 +[2025-02-20 22:33:45] (step=0003600) Train Loss: 0.1651, Train Steps/Sec: 12.63, Grad Norm: 0.1088 +[2025-02-20 22:33:53] (step=0003700) Train Loss: 0.1634, Train Steps/Sec: 12.63, Grad Norm: 0.1061 +[2025-02-20 22:34:02] (step=0003800) Train Loss: 0.1641, Train Steps/Sec: 11.30, Grad Norm: 0.1017 +[2025-02-20 22:34:10] (step=0003900) Train Loss: 0.1637, Train Steps/Sec: 12.59, Grad Norm: 0.0990 +[2025-02-20 22:34:18] (step=0004000) Train Loss: 0.1641, Train Steps/Sec: 12.61, Grad Norm: 0.1006 +[2025-02-20 22:34:26] (step=0004100) Train Loss: 0.1624, Train Steps/Sec: 12.62, Grad Norm: 0.0938 +[2025-02-20 22:34:34] (step=0004200) Train Loss: 0.1646, Train Steps/Sec: 12.60, Grad Norm: 0.0909 +[2025-02-20 22:34:41] (step=0004300) Train Loss: 0.1625, Train Steps/Sec: 12.60, Grad Norm: 0.0923 +[2025-02-20 22:34:49] (step=0004400) Train Loss: 0.1633, Train Steps/Sec: 12.62, Grad Norm: 0.0948 +[2025-02-20 22:34:57] (step=0004500) Train Loss: 0.1634, Train Steps/Sec: 12.62, Grad Norm: 0.0954 +[2025-02-20 22:35:06] (step=0004600) Train Loss: 0.1639, Train Steps/Sec: 12.16, Grad Norm: 0.0836 +[2025-02-20 22:35:15] (step=0004700) Train Loss: 0.1632, Train Steps/Sec: 10.30, Grad Norm: 0.0889 +[2025-02-20 22:35:25] (step=0004800) Train Loss: 0.1629, Train Steps/Sec: 10.07, Grad Norm: 0.0863 +[2025-02-20 22:35:33] (step=0004900) Train Loss: 0.1636, Train Steps/Sec: 12.61, Grad Norm: 0.0877 +[2025-02-20 22:35:42] (step=0005000) Train Loss: 0.1624, Train Steps/Sec: 11.38, Grad Norm: 0.0839 +[2025-02-20 22:35:51] (step=0005100) Train Loss: 0.1634, Train Steps/Sec: 11.32, Grad Norm: 0.0875 +[2025-02-20 22:35:59] (step=0005200) Train Loss: 0.1627, Train Steps/Sec: 12.62, Grad Norm: 0.0832 +[2025-02-20 22:36:07] (step=0005300) Train Loss: 0.1625, Train Steps/Sec: 12.62, Grad Norm: 0.0842 +[2025-02-20 22:36:15] (step=0005400) Train Loss: 0.1621, Train Steps/Sec: 12.60, Grad Norm: 0.0815 +[2025-02-20 22:36:22] (step=0005500) Train Loss: 0.1624, Train Steps/Sec: 12.61, Grad Norm: 0.0753 +[2025-02-20 22:36:30] (step=0005600) Train Loss: 0.1598, Train Steps/Sec: 12.63, Grad Norm: 0.0804 +[2025-02-20 22:36:38] (step=0005700) Train Loss: 0.1608, Train Steps/Sec: 12.61, Grad Norm: 0.0769 +[2025-02-20 22:36:46] (step=0005800) Train Loss: 0.1622, Train Steps/Sec: 12.62, Grad Norm: 0.0810 +[2025-02-20 22:36:54] (step=0005900) Train Loss: 0.1622, Train Steps/Sec: 12.62, Grad Norm: 0.0751 +[2025-02-20 22:37:02] (step=0006000) Train Loss: 0.1603, Train Steps/Sec: 12.62, Grad Norm: 0.0759 +[2025-02-20 22:37:10] (step=0006100) Train Loss: 0.1617, Train Steps/Sec: 12.62, Grad Norm: 0.0772 +[2025-02-20 22:37:18] (step=0006200) Train Loss: 0.1599, Train Steps/Sec: 12.61, Grad Norm: 0.0719 +[2025-02-20 22:37:27] (step=0006300) Train Loss: 0.1606, Train Steps/Sec: 11.31, Grad Norm: 0.0720 +[2025-02-20 22:37:35] (step=0006400) Train Loss: 0.1619, Train Steps/Sec: 12.58, Grad Norm: 0.0730 +[2025-02-20 22:37:43] (step=0006500) Train Loss: 0.1592, Train Steps/Sec: 12.57, Grad Norm: 0.0774 +[2025-02-20 22:37:51] (step=0006600) Train Loss: 0.1618, Train Steps/Sec: 12.62, Grad Norm: 0.0690 +[2025-02-20 22:37:59] (step=0006700) Train Loss: 0.1603, Train Steps/Sec: 12.59, Grad Norm: 0.0736 +[2025-02-20 22:38:07] (step=0006800) Train Loss: 0.1612, Train Steps/Sec: 11.71, Grad Norm: 0.0678 +[2025-02-20 22:38:17] (step=0006900) Train Loss: 0.1601, Train Steps/Sec: 10.00, Grad Norm: 0.0700 +[2025-02-20 22:38:26] (step=0007000) Train Loss: 0.1601, Train Steps/Sec: 11.31, Grad Norm: 0.0673 +[2025-02-20 22:38:34] (step=0007100) Train Loss: 0.1604, Train Steps/Sec: 11.81, Grad Norm: 0.0715 +[2025-02-20 22:38:43] (step=0007200) Train Loss: 0.1605, Train Steps/Sec: 11.72, Grad Norm: 0.0667 +[2025-02-20 22:38:51] (step=0007300) Train Loss: 0.1594, Train Steps/Sec: 12.60, Grad Norm: 0.0719 +[2025-02-20 22:38:59] (step=0007400) Train Loss: 0.1626, Train Steps/Sec: 12.57, Grad Norm: 0.0681 +[2025-02-20 22:39:07] (step=0007500) Train Loss: 0.1624, Train Steps/Sec: 12.49, Grad Norm: 0.0656 +[2025-02-20 22:39:16] (step=0007600) Train Loss: 0.1598, Train Steps/Sec: 11.30, Grad Norm: 0.0602 +[2025-02-20 22:39:24] (step=0007700) Train Loss: 0.1592, Train Steps/Sec: 12.57, Grad Norm: 0.0663 +[2025-02-20 22:39:32] (step=0007800) Train Loss: 0.1582, Train Steps/Sec: 12.60, Grad Norm: 0.0622 +[2025-02-20 22:39:40] (step=0007900) Train Loss: 0.1607, Train Steps/Sec: 12.59, Grad Norm: 0.0623 +[2025-02-20 22:39:48] (step=0008000) Train Loss: 0.1590, Train Steps/Sec: 12.60, Grad Norm: 0.0619 +[2025-02-20 22:39:55] (step=0008100) Train Loss: 0.1593, Train Steps/Sec: 12.60, Grad Norm: 0.0591 +[2025-02-20 22:40:03] (step=0008200) Train Loss: 0.1582, Train Steps/Sec: 12.59, Grad Norm: 0.0597 +[2025-02-20 22:40:11] (step=0008300) Train Loss: 0.1589, Train Steps/Sec: 12.60, Grad Norm: 0.0609 +[2025-02-20 22:40:19] (step=0008400) Train Loss: 0.1598, Train Steps/Sec: 12.60, Grad Norm: 0.0620 +[2025-02-20 22:40:27] (step=0008500) Train Loss: 0.1593, Train Steps/Sec: 12.60, Grad Norm: 0.0581 +[2025-02-20 22:40:35] (step=0008600) Train Loss: 0.1595, Train Steps/Sec: 12.61, Grad Norm: 0.0566 +[2025-02-20 22:40:43] (step=0008700) Train Loss: 0.1573, Train Steps/Sec: 12.61, Grad Norm: 0.0573 +[2025-02-20 22:40:52] (step=0008800) Train Loss: 0.1574, Train Steps/Sec: 11.22, Grad Norm: 0.0621 +[2025-02-20 22:41:00] (step=0008900) Train Loss: 0.1584, Train Steps/Sec: 12.62, Grad Norm: 0.0553 +[2025-02-20 22:41:09] (step=0009000) Train Loss: 0.1587, Train Steps/Sec: 11.32, Grad Norm: 0.0576 +[2025-02-20 22:41:19] (step=0009100) Train Loss: 0.1580, Train Steps/Sec: 9.97, Grad Norm: 0.0565 +[2025-02-20 22:41:28] (step=0009200) Train Loss: 0.1599, Train Steps/Sec: 11.02, Grad Norm: 0.0538 +[2025-02-20 22:41:37] (step=0009300) Train Loss: 0.1581, Train Steps/Sec: 11.05, Grad Norm: 0.0535 +[2025-02-20 22:41:45] (step=0009400) Train Loss: 0.1580, Train Steps/Sec: 12.64, Grad Norm: 0.0570 +[2025-02-20 22:41:53] (step=0009500) Train Loss: 0.1585, Train Steps/Sec: 12.63, Grad Norm: 0.0543 +[2025-02-20 22:42:01] (step=0009600) Train Loss: 0.1599, Train Steps/Sec: 12.64, Grad Norm: 0.0555 +[2025-02-20 22:42:09] (step=0009700) Train Loss: 0.1582, Train Steps/Sec: 12.65, Grad Norm: 0.0541 +[2025-02-20 22:42:16] (step=0009800) Train Loss: 0.1581, Train Steps/Sec: 12.61, Grad Norm: 0.0490 +[2025-02-20 22:42:24] (step=0009900) Train Loss: 0.1573, Train Steps/Sec: 12.60, Grad Norm: 0.0548 +[2025-02-20 22:42:32] (step=0010000) Train Loss: 0.1586, Train Steps/Sec: 12.53, Grad Norm: 0.0535 +[2025-02-20 22:42:41] (step=0010100) Train Loss: 0.1587, Train Steps/Sec: 11.31, Grad Norm: 0.0526 +[2025-02-20 22:42:49] (step=0010200) Train Loss: 0.1601, Train Steps/Sec: 12.62, Grad Norm: 0.0542 +[2025-02-20 22:42:57] (step=0010300) Train Loss: 0.1569, Train Steps/Sec: 12.64, Grad Norm: 0.0493 +[2025-02-20 22:43:05] (step=0010400) Train Loss: 0.1581, Train Steps/Sec: 12.63, Grad Norm: 0.0530 +[2025-02-20 22:43:13] (step=0010500) Train Loss: 0.1569, Train Steps/Sec: 12.65, Grad Norm: 0.0499 +[2025-02-20 22:43:21] (step=0010600) Train Loss: 0.1581, Train Steps/Sec: 12.64, Grad Norm: 0.0489 +[2025-02-20 22:43:29] (step=0010700) Train Loss: 0.1568, Train Steps/Sec: 12.64, Grad Norm: 0.0504 +[2025-02-20 22:43:37] (step=0010800) Train Loss: 0.1569, Train Steps/Sec: 12.63, Grad Norm: 0.0449 +[2025-02-20 22:43:45] (step=0010900) Train Loss: 0.1567, Train Steps/Sec: 12.64, Grad Norm: 0.0451 +[2025-02-20 22:43:52] (step=0011000) Train Loss: 0.1557, Train Steps/Sec: 12.64, Grad Norm: 0.0481 +[2025-02-20 22:44:00] (step=0011100) Train Loss: 0.1570, Train Steps/Sec: 12.64, Grad Norm: 0.0495 +[2025-02-20 22:44:09] (step=0011200) Train Loss: 0.1565, Train Steps/Sec: 11.71, Grad Norm: 0.0479 +[2025-02-20 22:44:20] (step=0011300) Train Loss: 0.1575, Train Steps/Sec: 9.21, Grad Norm: 0.0467 +[2025-02-20 22:44:30] (step=0011400) Train Loss: 0.1565, Train Steps/Sec: 10.10, Grad Norm: 0.0474 +[2025-02-20 22:44:38] (step=0011500) Train Loss: 0.1576, Train Steps/Sec: 12.61, Grad Norm: 0.0471 +[2025-02-20 22:44:46] (step=0011600) Train Loss: 0.1577, Train Steps/Sec: 12.63, Grad Norm: 0.0445 +[2025-02-20 22:44:53] (step=0011700) Train Loss: 0.1573, Train Steps/Sec: 12.63, Grad Norm: 0.0472 +[2025-02-20 22:45:01] (step=0011800) Train Loss: 0.1581, Train Steps/Sec: 12.62, Grad Norm: 0.0414 +[2025-02-20 22:45:09] (step=0011900) Train Loss: 0.1566, Train Steps/Sec: 12.64, Grad Norm: 0.0418 +[2025-02-20 22:45:17] (step=0012000) Train Loss: 0.1562, Train Steps/Sec: 12.64, Grad Norm: 0.0412 +[2025-02-20 22:45:25] (step=0012100) Train Loss: 0.1561, Train Steps/Sec: 12.64, Grad Norm: 0.0438 +[2025-02-20 22:45:33] (step=0012200) Train Loss: 0.1568, Train Steps/Sec: 12.65, Grad Norm: 0.0418 +[2025-02-20 22:45:41] (step=0012300) Train Loss: 0.1571, Train Steps/Sec: 12.62, Grad Norm: 0.0447 +[2025-02-20 22:45:49] (step=0012400) Train Loss: 0.1561, Train Steps/Sec: 12.62, Grad Norm: 0.0443 +[2025-02-20 22:45:57] (step=0012500) Train Loss: 0.1572, Train Steps/Sec: 12.56, Grad Norm: 0.0405 +[2025-02-20 22:46:06] (step=0012600) Train Loss: 0.1581, Train Steps/Sec: 11.34, Grad Norm: 0.0415 +[2025-02-20 22:46:14] (step=0012700) Train Loss: 0.1566, Train Steps/Sec: 12.64, Grad Norm: 0.0389 +[2025-02-20 22:46:21] (step=0012800) Train Loss: 0.1557, Train Steps/Sec: 12.64, Grad Norm: 0.0423 +[2025-02-20 22:46:29] (step=0012900) Train Loss: 0.1546, Train Steps/Sec: 12.65, Grad Norm: 0.0418 +[2025-02-20 22:46:37] (step=0013000) Train Loss: 0.1559, Train Steps/Sec: 12.65, Grad Norm: 0.0396 +[2025-02-20 22:46:45] (step=0013100) Train Loss: 0.1548, Train Steps/Sec: 12.65, Grad Norm: 0.0392 +[2025-02-20 22:46:53] (step=0013200) Train Loss: 0.1565, Train Steps/Sec: 12.62, Grad Norm: 0.0422 +[2025-02-20 22:47:01] (step=0013300) Train Loss: 0.1560, Train Steps/Sec: 12.64, Grad Norm: 0.0416 +[2025-02-20 22:47:10] (step=0013400) Train Loss: 0.1567, Train Steps/Sec: 11.30, Grad Norm: 0.0386 +[2025-02-20 22:47:21] (step=0013500) Train Loss: 0.1568, Train Steps/Sec: 9.00, Grad Norm: 0.0383 +[2025-02-20 22:47:31] (step=0013600) Train Loss: 0.1571, Train Steps/Sec: 10.08, Grad Norm: 0.0392 +[2025-02-20 22:47:39] (step=0013700) Train Loss: 0.1559, Train Steps/Sec: 12.64, Grad Norm: 0.0373 +[2025-02-20 22:47:48] (step=0013800) Train Loss: 0.1559, Train Steps/Sec: 11.27, Grad Norm: 0.0374 +[2025-02-20 22:47:56] (step=0013900) Train Loss: 0.1563, Train Steps/Sec: 12.64, Grad Norm: 0.0390 +[2025-02-20 22:48:04] (step=0014000) Train Loss: 0.1555, Train Steps/Sec: 12.64, Grad Norm: 0.0365 +[2025-02-20 22:48:11] (step=0014100) Train Loss: 0.1567, Train Steps/Sec: 12.65, Grad Norm: 0.0382 +[2025-02-20 22:48:19] (step=0014200) Train Loss: 0.1553, Train Steps/Sec: 12.64, Grad Norm: 0.0352 +[2025-02-20 22:48:27] (step=0014300) Train Loss: 0.1553, Train Steps/Sec: 12.63, Grad Norm: 0.0361 +[2025-02-20 22:48:35] (step=0014400) Train Loss: 0.1558, Train Steps/Sec: 12.65, Grad Norm: 0.0346 +[2025-02-20 22:48:43] (step=0014500) Train Loss: 0.1563, Train Steps/Sec: 12.64, Grad Norm: 0.0359 +[2025-02-20 22:48:51] (step=0014600) Train Loss: 0.1559, Train Steps/Sec: 12.65, Grad Norm: 0.0353 +[2025-02-20 22:48:59] (step=0014700) Train Loss: 0.1540, Train Steps/Sec: 12.64, Grad Norm: 0.0364 +[2025-02-20 22:49:07] (step=0014800) Train Loss: 0.1558, Train Steps/Sec: 12.65, Grad Norm: 0.0345 +[2025-02-20 22:49:15] (step=0014900) Train Loss: 0.1562, Train Steps/Sec: 12.64, Grad Norm: 0.0352 +[2025-02-20 22:49:23] (step=0015000) Train Loss: 0.1535, Train Steps/Sec: 12.58, Grad Norm: 0.0345 +[2025-02-20 22:49:32] (step=0015100) Train Loss: 0.1562, Train Steps/Sec: 11.33, Grad Norm: 0.0352 +[2025-02-20 22:49:39] (step=0015200) Train Loss: 0.1552, Train Steps/Sec: 12.58, Grad Norm: 0.0325 +[2025-02-20 22:49:47] (step=0015300) Train Loss: 0.1574, Train Steps/Sec: 12.56, Grad Norm: 0.0351 +[2025-02-20 22:49:55] (step=0015400) Train Loss: 0.1562, Train Steps/Sec: 12.54, Grad Norm: 0.0359 +[2025-02-20 22:50:03] (step=0015500) Train Loss: 0.1561, Train Steps/Sec: 12.56, Grad Norm: 0.0300 +[2025-02-20 22:50:13] (step=0015600) Train Loss: 0.1549, Train Steps/Sec: 10.66, Grad Norm: 0.0329 +[2025-02-20 22:50:25] (step=0015700) Train Loss: 0.1548, Train Steps/Sec: 8.02, Grad Norm: 0.0320 +[2025-02-20 22:50:34] (step=0015800) Train Loss: 0.1551, Train Steps/Sec: 11.40, Grad Norm: 0.0301 +[2025-02-20 22:50:42] (step=0015900) Train Loss: 0.1539, Train Steps/Sec: 12.62, Grad Norm: 0.0326 +[2025-02-20 22:50:50] (step=0016000) Train Loss: 0.1547, Train Steps/Sec: 12.63, Grad Norm: 0.0300 +[2025-02-20 22:50:58] (step=0016100) Train Loss: 0.1543, Train Steps/Sec: 12.63, Grad Norm: 0.0310 +[2025-02-20 22:51:06] (step=0016200) Train Loss: 0.1539, Train Steps/Sec: 12.62, Grad Norm: 0.0293 +[2025-02-20 22:51:15] (step=0016300) Train Loss: 0.1553, Train Steps/Sec: 11.28, Grad Norm: 0.0345 +[2025-02-20 22:51:22] (step=0016400) Train Loss: 0.1560, Train Steps/Sec: 12.63, Grad Norm: 0.0310 +[2025-02-20 22:51:30] (step=0016500) Train Loss: 0.1540, Train Steps/Sec: 12.64, Grad Norm: 0.0299 +[2025-02-20 22:51:38] (step=0016600) Train Loss: 0.1535, Train Steps/Sec: 12.62, Grad Norm: 0.0300 +[2025-02-20 22:51:46] (step=0016700) Train Loss: 0.1551, Train Steps/Sec: 12.62, Grad Norm: 0.0308 +[2025-02-20 22:51:54] (step=0016800) Train Loss: 0.1547, Train Steps/Sec: 12.65, Grad Norm: 0.0304 +[2025-02-20 22:52:02] (step=0016900) Train Loss: 0.1557, Train Steps/Sec: 12.64, Grad Norm: 0.0295 +[2025-02-20 22:52:10] (step=0017000) Train Loss: 0.1555, Train Steps/Sec: 12.64, Grad Norm: 0.0282 +[2025-02-20 22:52:18] (step=0017100) Train Loss: 0.1561, Train Steps/Sec: 12.64, Grad Norm: 0.0255 +[2025-02-20 22:52:26] (step=0017200) Train Loss: 0.1545, Train Steps/Sec: 12.64, Grad Norm: 0.0291 +[2025-02-20 22:52:34] (step=0017300) Train Loss: 0.1552, Train Steps/Sec: 12.63, Grad Norm: 0.0270 +[2025-02-20 22:52:42] (step=0017400) Train Loss: 0.1532, Train Steps/Sec: 12.65, Grad Norm: 0.0269 +[2025-02-20 22:52:50] (step=0017500) Train Loss: 0.1550, Train Steps/Sec: 12.58, Grad Norm: 0.0281 +[2025-02-20 22:52:58] (step=0017600) Train Loss: 0.1546, Train Steps/Sec: 11.34, Grad Norm: 0.0293 +[2025-02-20 22:53:06] (step=0017700) Train Loss: 0.1550, Train Steps/Sec: 12.58, Grad Norm: 0.0265 +[2025-02-20 22:53:18] (step=0017800) Train Loss: 0.1549, Train Steps/Sec: 8.72, Grad Norm: 0.0269 +[2025-02-20 22:53:30] (step=0017900) Train Loss: 0.1558, Train Steps/Sec: 8.56, Grad Norm: 0.0265 +[2025-02-20 22:53:37] (step=0018000) Train Loss: 0.1550, Train Steps/Sec: 12.61, Grad Norm: 0.0258 +[2025-02-20 22:53:45] (step=0018100) Train Loss: 0.1557, Train Steps/Sec: 12.61, Grad Norm: 0.0253 +[2025-02-20 22:53:53] (step=0018200) Train Loss: 0.1535, Train Steps/Sec: 12.59, Grad Norm: 0.0255 +[2025-02-20 22:54:01] (step=0018300) Train Loss: 0.1553, Train Steps/Sec: 12.60, Grad Norm: 0.0264 +[2025-02-20 22:54:09] (step=0018400) Train Loss: 0.1528, Train Steps/Sec: 12.60, Grad Norm: 0.0247 +[2025-02-20 22:54:17] (step=0018500) Train Loss: 0.1537, Train Steps/Sec: 12.58, Grad Norm: 0.0268 +[2025-02-20 22:54:25] (step=0018600) Train Loss: 0.1551, Train Steps/Sec: 12.58, Grad Norm: 0.0243 +[2025-02-20 22:54:33] (step=0018700) Train Loss: 0.1546, Train Steps/Sec: 12.59, Grad Norm: 0.0246 +[2025-02-20 22:54:42] (step=0018800) Train Loss: 0.1558, Train Steps/Sec: 11.26, Grad Norm: 0.0246 +[2025-02-20 22:54:50] (step=0018900) Train Loss: 0.1545, Train Steps/Sec: 12.63, Grad Norm: 0.0245 +[2025-02-20 22:54:58] (step=0019000) Train Loss: 0.1546, Train Steps/Sec: 12.63, Grad Norm: 0.0231 +[2025-02-20 22:55:06] (step=0019100) Train Loss: 0.1535, Train Steps/Sec: 12.63, Grad Norm: 0.0235 +[2025-02-20 22:55:14] (step=0019200) Train Loss: 0.1532, Train Steps/Sec: 12.61, Grad Norm: 0.0245 +[2025-02-20 22:55:22] (step=0019300) Train Loss: 0.1542, Train Steps/Sec: 12.62, Grad Norm: 0.0236 +[2025-02-20 22:55:29] (step=0019400) Train Loss: 0.1539, Train Steps/Sec: 12.63, Grad Norm: 0.0222 +[2025-02-20 22:55:37] (step=0019500) Train Loss: 0.1546, Train Steps/Sec: 12.63, Grad Norm: 0.0223 +[2025-02-20 22:55:45] (step=0019600) Train Loss: 0.1552, Train Steps/Sec: 12.64, Grad Norm: 0.0229 +[2025-02-20 22:55:53] (step=0019700) Train Loss: 0.1538, Train Steps/Sec: 12.64, Grad Norm: 0.0223 +[2025-02-20 22:56:01] (step=0019800) Train Loss: 0.1543, Train Steps/Sec: 12.64, Grad Norm: 0.0215 +[2025-02-20 22:56:10] (step=0019900) Train Loss: 0.1537, Train Steps/Sec: 11.45, Grad Norm: 0.0223 +[2025-02-20 22:56:21] (step=0020000) Train Loss: 0.1535, Train Steps/Sec: 9.19, Grad Norm: 0.0219 +[2025-02-20 22:56:26] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0020000.pt +[2025-02-20 22:56:38] (step=0020100) Train Loss: 0.1546, Train Steps/Sec: 5.69, Grad Norm: 0.0214 +[2025-02-20 22:56:46] (step=0020200) Train Loss: 0.1552, Train Steps/Sec: 12.59, Grad Norm: 0.0206 +[2025-02-20 22:56:54] (step=0020300) Train Loss: 0.1535, Train Steps/Sec: 12.59, Grad Norm: 0.0207 +[2025-02-20 22:57:02] (step=0020400) Train Loss: 0.1544, Train Steps/Sec: 12.59, Grad Norm: 0.0208 +[2025-02-20 22:57:10] (step=0020500) Train Loss: 0.1538, Train Steps/Sec: 12.61, Grad Norm: 0.0206 +[2025-02-20 22:57:18] (step=0020600) Train Loss: 0.1545, Train Steps/Sec: 12.60, Grad Norm: 0.0219 +[2025-02-20 22:57:26] (step=0020700) Train Loss: 0.1540, Train Steps/Sec: 12.60, Grad Norm: 0.0200 +[2025-02-20 22:57:34] (step=0020800) Train Loss: 0.1546, Train Steps/Sec: 12.60, Grad Norm: 0.0211 +[2025-02-20 22:57:42] (step=0020900) Train Loss: 0.1522, Train Steps/Sec: 12.61, Grad Norm: 0.0189 +[2025-02-20 22:57:50] (step=0021000) Train Loss: 0.1548, Train Steps/Sec: 12.59, Grad Norm: 0.0195 +[2025-02-20 22:57:58] (step=0021100) Train Loss: 0.1539, Train Steps/Sec: 12.60, Grad Norm: 0.0192 +[2025-02-20 22:58:06] (step=0021200) Train Loss: 0.1533, Train Steps/Sec: 12.60, Grad Norm: 0.0197 +[2025-02-20 22:58:15] (step=0021300) Train Loss: 0.1525, Train Steps/Sec: 11.25, Grad Norm: 0.0194 +[2025-02-20 22:58:22] (step=0021400) Train Loss: 0.1544, Train Steps/Sec: 12.60, Grad Norm: 0.0186 +[2025-02-20 22:58:30] (step=0021500) Train Loss: 0.1534, Train Steps/Sec: 12.60, Grad Norm: 0.0191 +[2025-02-20 22:58:38] (step=0021600) Train Loss: 0.1539, Train Steps/Sec: 12.61, Grad Norm: 0.0187 +[2025-02-20 22:58:46] (step=0021700) Train Loss: 0.1524, Train Steps/Sec: 12.61, Grad Norm: 0.0190 +[2025-02-20 22:58:54] (step=0021800) Train Loss: 0.1539, Train Steps/Sec: 12.63, Grad Norm: 0.0180 +[2025-02-20 22:59:02] (step=0021900) Train Loss: 0.1537, Train Steps/Sec: 12.64, Grad Norm: 0.0192 +[2025-02-20 22:59:10] (step=0022000) Train Loss: 0.1546, Train Steps/Sec: 12.64, Grad Norm: 0.0177 +[2025-02-20 22:59:20] (step=0022100) Train Loss: 0.1545, Train Steps/Sec: 10.06, Grad Norm: 0.0185 +[2025-02-20 22:59:31] (step=0022200) Train Loss: 0.1534, Train Steps/Sec: 9.42, Grad Norm: 0.0177 +[2025-02-20 22:59:41] (step=0022300) Train Loss: 0.1541, Train Steps/Sec: 9.24, Grad Norm: 0.0183 +[2025-02-20 22:59:49] (step=0022400) Train Loss: 0.1539, Train Steps/Sec: 12.59, Grad Norm: 0.0180 +[2025-02-20 22:59:57] (step=0022500) Train Loss: 0.1534, Train Steps/Sec: 12.55, Grad Norm: 0.0171 +[2025-02-20 23:00:06] (step=0022600) Train Loss: 0.1537, Train Steps/Sec: 11.34, Grad Norm: 0.0181 +[2025-02-20 23:00:14] (step=0022700) Train Loss: 0.1529, Train Steps/Sec: 12.59, Grad Norm: 0.0160 +[2025-02-20 23:00:22] (step=0022800) Train Loss: 0.1556, Train Steps/Sec: 12.60, Grad Norm: 0.0171 +[2025-02-20 23:00:30] (step=0022900) Train Loss: 0.1537, Train Steps/Sec: 12.59, Grad Norm: 0.0168 +[2025-02-20 23:00:38] (step=0023000) Train Loss: 0.1526, Train Steps/Sec: 12.60, Grad Norm: 0.0171 +[2025-02-20 23:00:46] (step=0023100) Train Loss: 0.1533, Train Steps/Sec: 12.60, Grad Norm: 0.0175 +[2025-02-20 23:00:54] (step=0023200) Train Loss: 0.1529, Train Steps/Sec: 12.58, Grad Norm: 0.0165 +[2025-02-20 23:01:02] (step=0023300) Train Loss: 0.1520, Train Steps/Sec: 12.54, Grad Norm: 0.0165 +[2025-02-20 23:01:10] (step=0023400) Train Loss: 0.1541, Train Steps/Sec: 12.59, Grad Norm: 0.0160 +[2025-02-20 23:01:18] (step=0023500) Train Loss: 0.1531, Train Steps/Sec: 12.58, Grad Norm: 0.0160 +[2025-02-20 23:01:26] (step=0023600) Train Loss: 0.1529, Train Steps/Sec: 12.57, Grad Norm: 0.0159 +[2025-02-20 23:01:34] (step=0023700) Train Loss: 0.1528, Train Steps/Sec: 12.61, Grad Norm: 0.0155 +[2025-02-20 23:01:42] (step=0023800) Train Loss: 0.1535, Train Steps/Sec: 11.30, Grad Norm: 0.0159 +[2025-02-20 23:01:50] (step=0023900) Train Loss: 0.1527, Train Steps/Sec: 12.56, Grad Norm: 0.0161 +[2025-02-20 23:01:58] (step=0024000) Train Loss: 0.1537, Train Steps/Sec: 12.57, Grad Norm: 0.0158 +[2025-02-20 23:02:06] (step=0024100) Train Loss: 0.1537, Train Steps/Sec: 12.52, Grad Norm: 0.0156 +[2025-02-20 23:02:16] (step=0024200) Train Loss: 0.1532, Train Steps/Sec: 10.33, Grad Norm: 0.0163 +[2025-02-20 23:02:24] (step=0024300) Train Loss: 0.1538, Train Steps/Sec: 12.02, Grad Norm: 0.0151 +[2025-02-20 23:02:36] (step=0024400) Train Loss: 0.1531, Train Steps/Sec: 8.68, Grad Norm: 0.0153 +[2025-02-20 23:02:46] (step=0024500) Train Loss: 0.1540, Train Steps/Sec: 9.45, Grad Norm: 0.0150 +[2025-02-20 23:02:54] (step=0024600) Train Loss: 0.1531, Train Steps/Sec: 12.50, Grad Norm: 0.0155 +[2025-02-20 23:03:02] (step=0024700) Train Loss: 0.1532, Train Steps/Sec: 12.53, Grad Norm: 0.0146 +[2025-02-20 23:03:10] (step=0024800) Train Loss: 0.1533, Train Steps/Sec: 12.55, Grad Norm: 0.0143 +[2025-02-20 23:03:18] (step=0024900) Train Loss: 0.1533, Train Steps/Sec: 12.53, Grad Norm: 0.0158 +[2025-02-20 23:03:26] (step=0025000) Train Loss: 0.1534, Train Steps/Sec: 12.49, Grad Norm: 0.0143 +[2025-02-20 23:03:35] (step=0025100) Train Loss: 0.1519, Train Steps/Sec: 11.34, Grad Norm: 0.0155 +[2025-02-20 23:03:43] (step=0025200) Train Loss: 0.1543, Train Steps/Sec: 12.58, Grad Norm: 0.0143 +[2025-02-20 23:03:51] (step=0025300) Train Loss: 0.1524, Train Steps/Sec: 12.56, Grad Norm: 0.0145 +[2025-02-20 23:03:59] (step=0025400) Train Loss: 0.1534, Train Steps/Sec: 12.60, Grad Norm: 0.0150 +[2025-02-20 23:04:07] (step=0025500) Train Loss: 0.1533, Train Steps/Sec: 12.58, Grad Norm: 0.0144 +[2025-02-20 23:04:15] (step=0025600) Train Loss: 0.1540, Train Steps/Sec: 12.59, Grad Norm: 0.0143 +[2025-02-20 23:04:23] (step=0025700) Train Loss: 0.1521, Train Steps/Sec: 12.60, Grad Norm: 0.0136 +[2025-02-20 23:04:31] (step=0025800) Train Loss: 0.1514, Train Steps/Sec: 12.57, Grad Norm: 0.0140 +[2025-02-20 23:04:39] (step=0025900) Train Loss: 0.1540, Train Steps/Sec: 12.51, Grad Norm: 0.0141 +[2025-02-20 23:04:47] (step=0026000) Train Loss: 0.1533, Train Steps/Sec: 12.51, Grad Norm: 0.0136 +[2025-02-20 23:04:55] (step=0026100) Train Loss: 0.1510, Train Steps/Sec: 12.57, Grad Norm: 0.0150 +[2025-02-20 23:05:03] (step=0026200) Train Loss: 0.1526, Train Steps/Sec: 12.58, Grad Norm: 0.0150 +[2025-02-20 23:05:13] (step=0026300) Train Loss: 0.1529, Train Steps/Sec: 9.66, Grad Norm: 0.0138 +[2025-02-20 23:05:22] (step=0026400) Train Loss: 0.1521, Train Steps/Sec: 11.76, Grad Norm: 0.0137 +[2025-02-20 23:05:30] (step=0026500) Train Loss: 0.1531, Train Steps/Sec: 12.09, Grad Norm: 0.0138 +[2025-02-20 23:05:42] (step=0026600) Train Loss: 0.1524, Train Steps/Sec: 8.47, Grad Norm: 0.0130 +[2025-02-20 23:05:51] (step=0026700) Train Loss: 0.1540, Train Steps/Sec: 10.64, Grad Norm: 0.0140 +[2025-02-20 23:05:59] (step=0026800) Train Loss: 0.1538, Train Steps/Sec: 12.56, Grad Norm: 0.0139 +[2025-02-20 23:06:07] (step=0026900) Train Loss: 0.1535, Train Steps/Sec: 12.57, Grad Norm: 0.0135 +[2025-02-20 23:06:15] (step=0027000) Train Loss: 0.1525, Train Steps/Sec: 12.56, Grad Norm: 0.0130 +[2025-02-20 23:06:23] (step=0027100) Train Loss: 0.1524, Train Steps/Sec: 12.56, Grad Norm: 0.0134 +[2025-02-20 23:06:31] (step=0027200) Train Loss: 0.1536, Train Steps/Sec: 12.57, Grad Norm: 0.0132 +[2025-02-20 23:06:39] (step=0027300) Train Loss: 0.1533, Train Steps/Sec: 12.58, Grad Norm: 0.0137 +[2025-02-20 23:06:47] (step=0027400) Train Loss: 0.1528, Train Steps/Sec: 12.57, Grad Norm: 0.0149 +[2025-02-20 23:06:55] (step=0027500) Train Loss: 0.1528, Train Steps/Sec: 12.51, Grad Norm: 0.0133 +[2025-02-20 23:07:04] (step=0027600) Train Loss: 0.1529, Train Steps/Sec: 11.32, Grad Norm: 0.0131 +[2025-02-20 23:07:11] (step=0027700) Train Loss: 0.1521, Train Steps/Sec: 12.61, Grad Norm: 0.0129 +[2025-02-20 23:07:19] (step=0027800) Train Loss: 0.1530, Train Steps/Sec: 12.62, Grad Norm: 0.0132 +[2025-02-20 23:07:27] (step=0027900) Train Loss: 0.1518, Train Steps/Sec: 12.62, Grad Norm: 0.0127 +[2025-02-20 23:07:35] (step=0028000) Train Loss: 0.1531, Train Steps/Sec: 12.61, Grad Norm: 0.0127 +[2025-02-20 23:07:43] (step=0028100) Train Loss: 0.1520, Train Steps/Sec: 12.62, Grad Norm: 0.0131 +[2025-02-20 23:07:51] (step=0028200) Train Loss: 0.1506, Train Steps/Sec: 12.62, Grad Norm: 0.0130 +[2025-02-20 23:07:59] (step=0028300) Train Loss: 0.1512, Train Steps/Sec: 12.61, Grad Norm: 0.0129 +[2025-02-20 23:08:07] (step=0028400) Train Loss: 0.1520, Train Steps/Sec: 12.61, Grad Norm: 0.0134 +[2025-02-20 23:08:17] (step=0028500) Train Loss: 0.1527, Train Steps/Sec: 10.13, Grad Norm: 0.0133 +[2025-02-20 23:08:25] (step=0028600) Train Loss: 0.1508, Train Steps/Sec: 12.60, Grad Norm: 0.0131 +[2025-02-20 23:08:33] (step=0028700) Train Loss: 0.1526, Train Steps/Sec: 11.70, Grad Norm: 0.0135 +[2025-02-20 23:08:46] (step=0028800) Train Loss: 0.1526, Train Steps/Sec: 7.63, Grad Norm: 0.0124 +[2025-02-20 23:08:55] (step=0028900) Train Loss: 0.1528, Train Steps/Sec: 11.76, Grad Norm: 0.0132 +[2025-02-20 23:09:03] (step=0029000) Train Loss: 0.1521, Train Steps/Sec: 12.60, Grad Norm: 0.0124 +[2025-02-20 23:09:11] (step=0029100) Train Loss: 0.1522, Train Steps/Sec: 12.60, Grad Norm: 0.0125 +[2025-02-20 23:09:19] (step=0029200) Train Loss: 0.1537, Train Steps/Sec: 12.61, Grad Norm: 0.0131 +[2025-02-20 23:09:27] (step=0029300) Train Loss: 0.1519, Train Steps/Sec: 12.61, Grad Norm: 0.0124 +[2025-02-20 23:09:35] (step=0029400) Train Loss: 0.1518, Train Steps/Sec: 12.63, Grad Norm: 0.0126 +[2025-02-20 23:09:43] (step=0029500) Train Loss: 0.1530, Train Steps/Sec: 12.62, Grad Norm: 0.0128 +[2025-02-20 23:09:50] (step=0029600) Train Loss: 0.1511, Train Steps/Sec: 12.62, Grad Norm: 0.0132 +[2025-02-20 23:09:58] (step=0029700) Train Loss: 0.1538, Train Steps/Sec: 12.63, Grad Norm: 0.0135 +[2025-02-20 23:10:06] (step=0029800) Train Loss: 0.1514, Train Steps/Sec: 12.62, Grad Norm: 0.0128 +[2025-02-20 23:10:14] (step=0029900) Train Loss: 0.1531, Train Steps/Sec: 12.61, Grad Norm: 0.0124 +[2025-02-20 23:10:22] (step=0030000) Train Loss: 0.1530, Train Steps/Sec: 12.54, Grad Norm: 0.0125 +[2025-02-20 23:10:31] (step=0030100) Train Loss: 0.1533, Train Steps/Sec: 11.29, Grad Norm: 0.0121 +[2025-02-20 23:10:39] (step=0030200) Train Loss: 0.1520, Train Steps/Sec: 12.58, Grad Norm: 0.0126 +[2025-02-20 23:10:47] (step=0030300) Train Loss: 0.1529, Train Steps/Sec: 12.48, Grad Norm: 0.0122 +[2025-02-20 23:10:55] (step=0030400) Train Loss: 0.1515, Train Steps/Sec: 12.59, Grad Norm: 0.0122 +[2025-02-20 23:11:03] (step=0030500) Train Loss: 0.1524, Train Steps/Sec: 12.62, Grad Norm: 0.0124 +[2025-02-20 23:11:12] (step=0030600) Train Loss: 0.1532, Train Steps/Sec: 10.45, Grad Norm: 0.0128 +[2025-02-20 23:11:21] (step=0030700) Train Loss: 0.1517, Train Steps/Sec: 12.21, Grad Norm: 0.0121 +[2025-02-20 23:11:29] (step=0030800) Train Loss: 0.1517, Train Steps/Sec: 12.59, Grad Norm: 0.0122 +[2025-02-20 23:11:37] (step=0030900) Train Loss: 0.1524, Train Steps/Sec: 11.30, Grad Norm: 0.0120 +[2025-02-20 23:11:49] (step=0031000) Train Loss: 0.1532, Train Steps/Sec: 8.51, Grad Norm: 0.0119 +[2025-02-20 23:11:58] (step=0031100) Train Loss: 0.1522, Train Steps/Sec: 11.74, Grad Norm: 0.0125 +[2025-02-20 23:12:06] (step=0031200) Train Loss: 0.1522, Train Steps/Sec: 12.58, Grad Norm: 0.0119 +[2025-02-20 23:12:15] (step=0031300) Train Loss: 0.1522, Train Steps/Sec: 11.22, Grad Norm: 0.0118 +[2025-02-20 23:12:23] (step=0031400) Train Loss: 0.1516, Train Steps/Sec: 12.59, Grad Norm: 0.0123 +[2025-02-20 23:12:30] (step=0031500) Train Loss: 0.1522, Train Steps/Sec: 12.59, Grad Norm: 0.0134 +[2025-02-20 23:12:38] (step=0031600) Train Loss: 0.1531, Train Steps/Sec: 12.60, Grad Norm: 0.0120 +[2025-02-20 23:12:46] (step=0031700) Train Loss: 0.1529, Train Steps/Sec: 12.56, Grad Norm: 0.0120 +[2025-02-20 23:12:54] (step=0031800) Train Loss: 0.1525, Train Steps/Sec: 12.54, Grad Norm: 0.0118 +[2025-02-20 23:13:02] (step=0031900) Train Loss: 0.1523, Train Steps/Sec: 12.55, Grad Norm: 0.0125 +[2025-02-20 23:13:10] (step=0032000) Train Loss: 0.1526, Train Steps/Sec: 12.53, Grad Norm: 0.0120 +[2025-02-20 23:13:18] (step=0032100) Train Loss: 0.1519, Train Steps/Sec: 12.57, Grad Norm: 0.0116 +[2025-02-20 23:13:26] (step=0032200) Train Loss: 0.1517, Train Steps/Sec: 12.53, Grad Norm: 0.0120 +[2025-02-20 23:13:34] (step=0032300) Train Loss: 0.1529, Train Steps/Sec: 12.55, Grad Norm: 0.0117 +[2025-02-20 23:13:42] (step=0032400) Train Loss: 0.1518, Train Steps/Sec: 12.57, Grad Norm: 0.0117 +[2025-02-20 23:13:50] (step=0032500) Train Loss: 0.1524, Train Steps/Sec: 12.48, Grad Norm: 0.0122 +[2025-02-20 23:13:59] (step=0032600) Train Loss: 0.1507, Train Steps/Sec: 11.30, Grad Norm: 0.0123 +[2025-02-20 23:14:07] (step=0032700) Train Loss: 0.1524, Train Steps/Sec: 12.20, Grad Norm: 0.0123 +[2025-02-20 23:14:17] (step=0032800) Train Loss: 0.1522, Train Steps/Sec: 10.43, Grad Norm: 0.0117 +[2025-02-20 23:14:25] (step=0032900) Train Loss: 0.1527, Train Steps/Sec: 12.57, Grad Norm: 0.0117 +[2025-02-20 23:14:33] (step=0033000) Train Loss: 0.1526, Train Steps/Sec: 12.59, Grad Norm: 0.0118 +[2025-02-20 23:14:42] (step=0033100) Train Loss: 0.1510, Train Steps/Sec: 10.54, Grad Norm: 0.0118 +[2025-02-20 23:14:54] (step=0033200) Train Loss: 0.1515, Train Steps/Sec: 8.80, Grad Norm: 0.0115 +[2025-02-20 23:15:02] (step=0033300) Train Loss: 0.1509, Train Steps/Sec: 12.60, Grad Norm: 0.0114 +[2025-02-20 23:15:09] (step=0033400) Train Loss: 0.1526, Train Steps/Sec: 12.60, Grad Norm: 0.0122 +[2025-02-20 23:15:17] (step=0033500) Train Loss: 0.1508, Train Steps/Sec: 12.57, Grad Norm: 0.0119 +[2025-02-20 23:15:25] (step=0033600) Train Loss: 0.1528, Train Steps/Sec: 12.59, Grad Norm: 0.0121 +[2025-02-20 23:15:33] (step=0033700) Train Loss: 0.1527, Train Steps/Sec: 12.58, Grad Norm: 0.0121 +[2025-02-20 23:15:42] (step=0033800) Train Loss: 0.1520, Train Steps/Sec: 11.25, Grad Norm: 0.0113 +[2025-02-20 23:15:50] (step=0033900) Train Loss: 0.1519, Train Steps/Sec: 12.61, Grad Norm: 0.0114 +[2025-02-20 23:15:58] (step=0034000) Train Loss: 0.1516, Train Steps/Sec: 12.62, Grad Norm: 0.0115 +[2025-02-20 23:16:06] (step=0034100) Train Loss: 0.1535, Train Steps/Sec: 12.61, Grad Norm: 0.0118 +[2025-02-20 23:16:14] (step=0034200) Train Loss: 0.1511, Train Steps/Sec: 12.62, Grad Norm: 0.0120 +[2025-02-20 23:16:22] (step=0034300) Train Loss: 0.1525, Train Steps/Sec: 12.61, Grad Norm: 0.0115 +[2025-02-20 23:16:30] (step=0034400) Train Loss: 0.1519, Train Steps/Sec: 12.59, Grad Norm: 0.0113 +[2025-02-20 23:16:38] (step=0034500) Train Loss: 0.1518, Train Steps/Sec: 12.58, Grad Norm: 0.0114 +[2025-02-20 23:16:46] (step=0034600) Train Loss: 0.1521, Train Steps/Sec: 12.57, Grad Norm: 0.0120 +[2025-02-20 23:16:54] (step=0034700) Train Loss: 0.1528, Train Steps/Sec: 12.61, Grad Norm: 0.0116 +[2025-02-20 23:17:02] (step=0034800) Train Loss: 0.1521, Train Steps/Sec: 12.60, Grad Norm: 0.0119 +[2025-02-20 23:17:12] (step=0034900) Train Loss: 0.1517, Train Steps/Sec: 10.07, Grad Norm: 0.0113 +[2025-02-20 23:17:19] (step=0035000) Train Loss: 0.1513, Train Steps/Sec: 12.53, Grad Norm: 0.0110 +[2025-02-20 23:17:28] (step=0035100) Train Loss: 0.1533, Train Steps/Sec: 11.22, Grad Norm: 0.0115 +[2025-02-20 23:17:36] (step=0035200) Train Loss: 0.1512, Train Steps/Sec: 12.57, Grad Norm: 0.0116 +[2025-02-20 23:17:47] (step=0035300) Train Loss: 0.1512, Train Steps/Sec: 9.67, Grad Norm: 0.0109 +[2025-02-20 23:17:58] (step=0035400) Train Loss: 0.1532, Train Steps/Sec: 9.02, Grad Norm: 0.0112 +[2025-02-20 23:18:06] (step=0035500) Train Loss: 0.1514, Train Steps/Sec: 12.63, Grad Norm: 0.0112 +[2025-02-20 23:18:14] (step=0035600) Train Loss: 0.1511, Train Steps/Sec: 12.62, Grad Norm: 0.0113 +[2025-02-20 23:18:22] (step=0035700) Train Loss: 0.1523, Train Steps/Sec: 12.61, Grad Norm: 0.0120 +[2025-02-20 23:18:29] (step=0035800) Train Loss: 0.1506, Train Steps/Sec: 12.62, Grad Norm: 0.0108 +[2025-02-20 23:18:37] (step=0035900) Train Loss: 0.1523, Train Steps/Sec: 12.61, Grad Norm: 0.0110 +[2025-02-20 23:18:45] (step=0036000) Train Loss: 0.1513, Train Steps/Sec: 12.63, Grad Norm: 0.0115 +[2025-02-20 23:18:53] (step=0036100) Train Loss: 0.1525, Train Steps/Sec: 12.63, Grad Norm: 0.0115 +[2025-02-20 23:19:01] (step=0036200) Train Loss: 0.1522, Train Steps/Sec: 12.63, Grad Norm: 0.0110 +[2025-02-20 23:19:10] (step=0036300) Train Loss: 0.1521, Train Steps/Sec: 11.27, Grad Norm: 0.0116 +[2025-02-20 23:19:18] (step=0036400) Train Loss: 0.1514, Train Steps/Sec: 12.57, Grad Norm: 0.0114 +[2025-02-20 23:19:26] (step=0036500) Train Loss: 0.1518, Train Steps/Sec: 12.58, Grad Norm: 0.0115 +[2025-02-20 23:19:34] (step=0036600) Train Loss: 0.1518, Train Steps/Sec: 12.59, Grad Norm: 0.0108 +[2025-02-20 23:19:42] (step=0036700) Train Loss: 0.1512, Train Steps/Sec: 12.57, Grad Norm: 0.0118 +[2025-02-20 23:19:50] (step=0036800) Train Loss: 0.1512, Train Steps/Sec: 12.60, Grad Norm: 0.0111 +[2025-02-20 23:19:58] (step=0036900) Train Loss: 0.1516, Train Steps/Sec: 12.59, Grad Norm: 0.0107 +[2025-02-20 23:20:07] (step=0037000) Train Loss: 0.1506, Train Steps/Sec: 11.06, Grad Norm: 0.0113 +[2025-02-20 23:20:15] (step=0037100) Train Loss: 0.1525, Train Steps/Sec: 11.75, Grad Norm: 0.0113 +[2025-02-20 23:20:23] (step=0037200) Train Loss: 0.1505, Train Steps/Sec: 12.62, Grad Norm: 0.0114 +[2025-02-20 23:20:31] (step=0037300) Train Loss: 0.1502, Train Steps/Sec: 12.62, Grad Norm: 0.0113 +[2025-02-20 23:20:39] (step=0037400) Train Loss: 0.1522, Train Steps/Sec: 12.59, Grad Norm: 0.0107 +[2025-02-20 23:20:49] (step=0037500) Train Loss: 0.1518, Train Steps/Sec: 9.62, Grad Norm: 0.0107 +[2025-02-20 23:21:01] (step=0037600) Train Loss: 0.1526, Train Steps/Sec: 8.38, Grad Norm: 0.0113 +[2025-02-20 23:21:09] (step=0037700) Train Loss: 0.1527, Train Steps/Sec: 12.60, Grad Norm: 0.0109 +[2025-02-20 23:21:17] (step=0037800) Train Loss: 0.1527, Train Steps/Sec: 12.59, Grad Norm: 0.0108 +[2025-02-20 23:21:25] (step=0037900) Train Loss: 0.1521, Train Steps/Sec: 12.58, Grad Norm: 0.0107 +[2025-02-20 23:21:33] (step=0038000) Train Loss: 0.1509, Train Steps/Sec: 12.56, Grad Norm: 0.0107 +[2025-02-20 23:21:41] (step=0038100) Train Loss: 0.1514, Train Steps/Sec: 12.58, Grad Norm: 0.0115 +[2025-02-20 23:21:49] (step=0038200) Train Loss: 0.1508, Train Steps/Sec: 12.61, Grad Norm: 0.0108 +[2025-02-20 23:21:57] (step=0038300) Train Loss: 0.1525, Train Steps/Sec: 12.60, Grad Norm: 0.0111 +[2025-02-20 23:22:05] (step=0038400) Train Loss: 0.1511, Train Steps/Sec: 12.60, Grad Norm: 0.0114 +[2025-02-20 23:22:13] (step=0038500) Train Loss: 0.1517, Train Steps/Sec: 12.60, Grad Norm: 0.0111 +[2025-02-20 23:22:21] (step=0038600) Train Loss: 0.1517, Train Steps/Sec: 12.60, Grad Norm: 0.0112 +[2025-02-20 23:22:29] (step=0038700) Train Loss: 0.1513, Train Steps/Sec: 12.60, Grad Norm: 0.0106 +[2025-02-20 23:22:38] (step=0038800) Train Loss: 0.1523, Train Steps/Sec: 11.24, Grad Norm: 0.0108 +[2025-02-20 23:22:46] (step=0038900) Train Loss: 0.1524, Train Steps/Sec: 12.58, Grad Norm: 0.0111 +[2025-02-20 23:22:54] (step=0039000) Train Loss: 0.1519, Train Steps/Sec: 12.60, Grad Norm: 0.0108 +[2025-02-20 23:23:02] (step=0039100) Train Loss: 0.1511, Train Steps/Sec: 12.20, Grad Norm: 0.0109 +[2025-02-20 23:23:11] (step=0039200) Train Loss: 0.1513, Train Steps/Sec: 10.73, Grad Norm: 0.0107 +[2025-02-20 23:23:19] (step=0039300) Train Loss: 0.1520, Train Steps/Sec: 12.60, Grad Norm: 0.0111 +[2025-02-20 23:23:27] (step=0039400) Train Loss: 0.1492, Train Steps/Sec: 12.58, Grad Norm: 0.0106 +[2025-02-20 23:23:35] (step=0039500) Train Loss: 0.1525, Train Steps/Sec: 12.62, Grad Norm: 0.0114 +[2025-02-20 23:23:43] (step=0039600) Train Loss: 0.1518, Train Steps/Sec: 12.62, Grad Norm: 0.0108 +[2025-02-20 23:23:54] (step=0039700) Train Loss: 0.1528, Train Steps/Sec: 9.16, Grad Norm: 0.0110 +[2025-02-20 23:24:04] (step=0039800) Train Loss: 0.1516, Train Steps/Sec: 9.46, Grad Norm: 0.0109 +[2025-02-20 23:24:12] (step=0039900) Train Loss: 0.1518, Train Steps/Sec: 12.61, Grad Norm: 0.0107 +[2025-02-20 23:24:20] (step=0040000) Train Loss: 0.1524, Train Steps/Sec: 12.55, Grad Norm: 0.0110 +[2025-02-20 23:24:24] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0040000.pt +[2025-02-20 23:24:33] (step=0040100) Train Loss: 0.1509, Train Steps/Sec: 7.77, Grad Norm: 0.0105 +[2025-02-20 23:24:41] (step=0040200) Train Loss: 0.1508, Train Steps/Sec: 12.59, Grad Norm: 0.0105 +[2025-02-20 23:24:49] (step=0040300) Train Loss: 0.1511, Train Steps/Sec: 12.58, Grad Norm: 0.0109 +[2025-02-20 23:24:57] (step=0040400) Train Loss: 0.1513, Train Steps/Sec: 12.58, Grad Norm: 0.0107 +[2025-02-20 23:25:05] (step=0040500) Train Loss: 0.1519, Train Steps/Sec: 12.60, Grad Norm: 0.0105 +[2025-02-20 23:25:13] (step=0040600) Train Loss: 0.1524, Train Steps/Sec: 12.60, Grad Norm: 0.0104 +[2025-02-20 23:25:21] (step=0040700) Train Loss: 0.1506, Train Steps/Sec: 12.61, Grad Norm: 0.0105 +[2025-02-20 23:25:29] (step=0040800) Train Loss: 0.1515, Train Steps/Sec: 12.59, Grad Norm: 0.0108 +[2025-02-20 23:25:37] (step=0040900) Train Loss: 0.1515, Train Steps/Sec: 12.60, Grad Norm: 0.0108 +[2025-02-20 23:25:45] (step=0041000) Train Loss: 0.1512, Train Steps/Sec: 12.59, Grad Norm: 0.0102 +[2025-02-20 23:25:53] (step=0041100) Train Loss: 0.1513, Train Steps/Sec: 12.61, Grad Norm: 0.0108 +[2025-02-20 23:26:00] (step=0041200) Train Loss: 0.1513, Train Steps/Sec: 12.59, Grad Norm: 0.0104 +[2025-02-20 23:26:11] (step=0041300) Train Loss: 0.1500, Train Steps/Sec: 9.54, Grad Norm: 0.0109 +[2025-02-20 23:26:19] (step=0041400) Train Loss: 0.1518, Train Steps/Sec: 11.77, Grad Norm: 0.0103 +[2025-02-20 23:26:27] (step=0041500) Train Loss: 0.1494, Train Steps/Sec: 12.62, Grad Norm: 0.0106 +[2025-02-20 23:26:35] (step=0041600) Train Loss: 0.1531, Train Steps/Sec: 12.63, Grad Norm: 0.0110 +[2025-02-20 23:26:43] (step=0041700) Train Loss: 0.1512, Train Steps/Sec: 12.64, Grad Norm: 0.0106 +[2025-02-20 23:26:51] (step=0041800) Train Loss: 0.1524, Train Steps/Sec: 12.16, Grad Norm: 0.0103 +[2025-02-20 23:27:02] (step=0041900) Train Loss: 0.1521, Train Steps/Sec: 9.20, Grad Norm: 0.0107 +[2025-02-20 23:27:13] (step=0042000) Train Loss: 0.1524, Train Steps/Sec: 9.26, Grad Norm: 0.0109 +[2025-02-20 23:27:21] (step=0042100) Train Loss: 0.1512, Train Steps/Sec: 12.61, Grad Norm: 0.0101 +[2025-02-20 23:27:29] (step=0042200) Train Loss: 0.1507, Train Steps/Sec: 12.62, Grad Norm: 0.0108 +[2025-02-20 23:27:37] (step=0042300) Train Loss: 0.1507, Train Steps/Sec: 12.62, Grad Norm: 0.0105 +[2025-02-20 23:27:45] (step=0042400) Train Loss: 0.1511, Train Steps/Sec: 12.60, Grad Norm: 0.0098 +[2025-02-20 23:27:53] (step=0042500) Train Loss: 0.1519, Train Steps/Sec: 12.60, Grad Norm: 0.0103 +[2025-02-20 23:28:02] (step=0042600) Train Loss: 0.1520, Train Steps/Sec: 11.28, Grad Norm: 0.0109 +[2025-02-20 23:28:10] (step=0042700) Train Loss: 0.1523, Train Steps/Sec: 12.60, Grad Norm: 0.0104 +[2025-02-20 23:28:17] (step=0042800) Train Loss: 0.1519, Train Steps/Sec: 12.61, Grad Norm: 0.0104 +[2025-02-20 23:28:25] (step=0042900) Train Loss: 0.1522, Train Steps/Sec: 12.61, Grad Norm: 0.0099 +[2025-02-20 23:28:33] (step=0043000) Train Loss: 0.1523, Train Steps/Sec: 12.60, Grad Norm: 0.0101 +[2025-02-20 23:28:41] (step=0043100) Train Loss: 0.1520, Train Steps/Sec: 12.60, Grad Norm: 0.0104 +[2025-02-20 23:28:49] (step=0043200) Train Loss: 0.1514, Train Steps/Sec: 12.61, Grad Norm: 0.0102 +[2025-02-20 23:28:57] (step=0043300) Train Loss: 0.1504, Train Steps/Sec: 12.60, Grad Norm: 0.0104 +[2025-02-20 23:29:06] (step=0043400) Train Loss: 0.1520, Train Steps/Sec: 11.80, Grad Norm: 0.0099 +[2025-02-20 23:29:15] (step=0043500) Train Loss: 0.1510, Train Steps/Sec: 10.44, Grad Norm: 0.0101 +[2025-02-20 23:29:23] (step=0043600) Train Loss: 0.1518, Train Steps/Sec: 12.60, Grad Norm: 0.0103 +[2025-02-20 23:29:31] (step=0043700) Train Loss: 0.1495, Train Steps/Sec: 12.61, Grad Norm: 0.0101 +[2025-02-20 23:29:40] (step=0043800) Train Loss: 0.1504, Train Steps/Sec: 11.27, Grad Norm: 0.0107 +[2025-02-20 23:29:48] (step=0043900) Train Loss: 0.1514, Train Steps/Sec: 12.59, Grad Norm: 0.0102 +[2025-02-20 23:29:56] (step=0044000) Train Loss: 0.1515, Train Steps/Sec: 12.13, Grad Norm: 0.0103 +[2025-02-20 23:30:09] (step=0044100) Train Loss: 0.1513, Train Steps/Sec: 7.91, Grad Norm: 0.0105 +[2025-02-20 23:30:18] (step=0044200) Train Loss: 0.1512, Train Steps/Sec: 10.67, Grad Norm: 0.0108 +[2025-02-20 23:30:26] (step=0044300) Train Loss: 0.1513, Train Steps/Sec: 12.59, Grad Norm: 0.0103 +[2025-02-20 23:30:34] (step=0044400) Train Loss: 0.1518, Train Steps/Sec: 12.59, Grad Norm: 0.0099 +[2025-02-20 23:30:42] (step=0044500) Train Loss: 0.1512, Train Steps/Sec: 12.59, Grad Norm: 0.0100 +[2025-02-20 23:30:50] (step=0044600) Train Loss: 0.1510, Train Steps/Sec: 12.61, Grad Norm: 0.0102 +[2025-02-20 23:30:58] (step=0044700) Train Loss: 0.1529, Train Steps/Sec: 12.61, Grad Norm: 0.0104 +[2025-02-20 23:31:06] (step=0044800) Train Loss: 0.1526, Train Steps/Sec: 12.63, Grad Norm: 0.0098 +[2025-02-20 23:31:14] (step=0044900) Train Loss: 0.1507, Train Steps/Sec: 12.63, Grad Norm: 0.0098 +[2025-02-20 23:31:22] (step=0045000) Train Loss: 0.1520, Train Steps/Sec: 12.63, Grad Norm: 0.0100 +[2025-02-20 23:31:30] (step=0045100) Train Loss: 0.1514, Train Steps/Sec: 11.27, Grad Norm: 0.0101 +[2025-02-20 23:31:38] (step=0045200) Train Loss: 0.1510, Train Steps/Sec: 12.60, Grad Norm: 0.0097 +[2025-02-20 23:31:46] (step=0045300) Train Loss: 0.1505, Train Steps/Sec: 12.60, Grad Norm: 0.0104 +[2025-02-20 23:31:54] (step=0045400) Train Loss: 0.1515, Train Steps/Sec: 12.60, Grad Norm: 0.0100 +[2025-02-20 23:32:02] (step=0045500) Train Loss: 0.1510, Train Steps/Sec: 12.60, Grad Norm: 0.0106 +[2025-02-20 23:32:12] (step=0045600) Train Loss: 0.1508, Train Steps/Sec: 9.83, Grad Norm: 0.0101 +[2025-02-20 23:32:20] (step=0045700) Train Loss: 0.1502, Train Steps/Sec: 12.59, Grad Norm: 0.0099 +[2025-02-20 23:32:28] (step=0045800) Train Loss: 0.1521, Train Steps/Sec: 12.55, Grad Norm: 0.0101 +[2025-02-20 23:32:36] (step=0045900) Train Loss: 0.1515, Train Steps/Sec: 12.59, Grad Norm: 0.0101 +[2025-02-20 23:32:44] (step=0046000) Train Loss: 0.1513, Train Steps/Sec: 12.58, Grad Norm: 0.0101 +[2025-02-20 23:32:52] (step=0046100) Train Loss: 0.1509, Train Steps/Sec: 12.57, Grad Norm: 0.0101 +[2025-02-20 23:33:00] (step=0046200) Train Loss: 0.1501, Train Steps/Sec: 12.10, Grad Norm: 0.0097 +[2025-02-20 23:33:16] (step=0046300) Train Loss: 0.1494, Train Steps/Sec: 6.34, Grad Norm: 0.0099 +[2025-02-20 23:33:25] (step=0046400) Train Loss: 0.1501, Train Steps/Sec: 11.74, Grad Norm: 0.0098 +[2025-02-20 23:33:33] (step=0046500) Train Loss: 0.1502, Train Steps/Sec: 12.58, Grad Norm: 0.0100 +[2025-02-20 23:33:42] (step=0046600) Train Loss: 0.1527, Train Steps/Sec: 11.32, Grad Norm: 0.0100 +[2025-02-20 23:33:49] (step=0046700) Train Loss: 0.1514, Train Steps/Sec: 12.55, Grad Norm: 0.0103 +[2025-02-20 23:33:57] (step=0046800) Train Loss: 0.1506, Train Steps/Sec: 12.58, Grad Norm: 0.0102 +[2025-02-20 23:34:05] (step=0046900) Train Loss: 0.1499, Train Steps/Sec: 12.61, Grad Norm: 0.0096 +[2025-02-20 23:34:13] (step=0047000) Train Loss: 0.1502, Train Steps/Sec: 12.60, Grad Norm: 0.0100 +[2025-02-20 23:34:21] (step=0047100) Train Loss: 0.1504, Train Steps/Sec: 12.61, Grad Norm: 0.0102 +[2025-02-20 23:34:29] (step=0047200) Train Loss: 0.1498, Train Steps/Sec: 12.59, Grad Norm: 0.0102 +[2025-02-20 23:34:37] (step=0047300) Train Loss: 0.1500, Train Steps/Sec: 12.62, Grad Norm: 0.0097 +[2025-02-20 23:34:45] (step=0047400) Train Loss: 0.1505, Train Steps/Sec: 12.61, Grad Norm: 0.0100 +[2025-02-20 23:34:53] (step=0047500) Train Loss: 0.1500, Train Steps/Sec: 12.61, Grad Norm: 0.0101 +[2025-02-20 23:35:02] (step=0047600) Train Loss: 0.1512, Train Steps/Sec: 11.28, Grad Norm: 0.0098 +[2025-02-20 23:35:11] (step=0047700) Train Loss: 0.1514, Train Steps/Sec: 10.40, Grad Norm: 0.0099 +[2025-02-20 23:35:20] (step=0047800) Train Loss: 0.1495, Train Steps/Sec: 11.78, Grad Norm: 0.0097 +[2025-02-20 23:35:28] (step=0047900) Train Loss: 0.1503, Train Steps/Sec: 12.62, Grad Norm: 0.0103 +[2025-02-20 23:35:36] (step=0048000) Train Loss: 0.1505, Train Steps/Sec: 12.62, Grad Norm: 0.0096 +[2025-02-20 23:35:44] (step=0048100) Train Loss: 0.1501, Train Steps/Sec: 12.63, Grad Norm: 0.0095 +[2025-02-20 23:35:52] (step=0048200) Train Loss: 0.1506, Train Steps/Sec: 12.62, Grad Norm: 0.0095 +[2025-02-20 23:36:00] (step=0048300) Train Loss: 0.1515, Train Steps/Sec: 12.59, Grad Norm: 0.0097 +[2025-02-20 23:36:08] (step=0048400) Train Loss: 0.1513, Train Steps/Sec: 11.29, Grad Norm: 0.0100 +[2025-02-20 23:36:21] (step=0048500) Train Loss: 0.1488, Train Steps/Sec: 7.90, Grad Norm: 0.0098 +[2025-02-20 23:36:30] (step=0048600) Train Loss: 0.1519, Train Steps/Sec: 11.76, Grad Norm: 0.0101 +[2025-02-20 23:36:38] (step=0048700) Train Loss: 0.1502, Train Steps/Sec: 12.60, Grad Norm: 0.0095 +[2025-02-20 23:36:46] (step=0048800) Train Loss: 0.1518, Train Steps/Sec: 11.25, Grad Norm: 0.0098 +[2025-02-20 23:36:54] (step=0048900) Train Loss: 0.1515, Train Steps/Sec: 12.58, Grad Norm: 0.0096 +[2025-02-20 23:37:02] (step=0049000) Train Loss: 0.1507, Train Steps/Sec: 12.59, Grad Norm: 0.0097 +[2025-02-20 23:37:10] (step=0049100) Train Loss: 0.1499, Train Steps/Sec: 12.60, Grad Norm: 0.0096 +[2025-02-20 23:37:18] (step=0049200) Train Loss: 0.1498, Train Steps/Sec: 12.59, Grad Norm: 0.0096 +[2025-02-20 23:37:26] (step=0049300) Train Loss: 0.1499, Train Steps/Sec: 12.58, Grad Norm: 0.0095 +[2025-02-20 23:37:34] (step=0049400) Train Loss: 0.1514, Train Steps/Sec: 12.60, Grad Norm: 0.0095 +[2025-02-20 23:37:42] (step=0049500) Train Loss: 0.1514, Train Steps/Sec: 12.61, Grad Norm: 0.0097 +[2025-02-20 23:37:50] (step=0049600) Train Loss: 0.1511, Train Steps/Sec: 12.62, Grad Norm: 0.0099 +[2025-02-20 23:37:58] (step=0049700) Train Loss: 0.1493, Train Steps/Sec: 12.59, Grad Norm: 0.0097 +[2025-02-20 23:38:07] (step=0049800) Train Loss: 0.1495, Train Steps/Sec: 11.39, Grad Norm: 0.0093 +[2025-02-20 23:38:16] (step=0049900) Train Loss: 0.1512, Train Steps/Sec: 10.77, Grad Norm: 0.0101 +[2025-02-20 23:38:24] (step=0050000) Train Loss: 0.1519, Train Steps/Sec: 12.61, Grad Norm: 0.0096 +[2025-02-20 23:38:33] (step=0050100) Train Loss: 0.1514, Train Steps/Sec: 11.26, Grad Norm: 0.0097 +[2025-02-20 23:38:41] (step=0050200) Train Loss: 0.1496, Train Steps/Sec: 12.61, Grad Norm: 0.0094 +[2025-02-20 23:38:49] (step=0050300) Train Loss: 0.1506, Train Steps/Sec: 12.62, Grad Norm: 0.0101 +[2025-02-20 23:38:57] (step=0050400) Train Loss: 0.1495, Train Steps/Sec: 12.63, Grad Norm: 0.0094 +[2025-02-20 23:39:04] (step=0050500) Train Loss: 0.1508, Train Steps/Sec: 12.63, Grad Norm: 0.0093 +[2025-02-20 23:39:14] (step=0050600) Train Loss: 0.1497, Train Steps/Sec: 10.62, Grad Norm: 0.0097 +[2025-02-20 23:39:26] (step=0050700) Train Loss: 0.1504, Train Steps/Sec: 8.52, Grad Norm: 0.0095 +[2025-02-20 23:39:34] (step=0050800) Train Loss: 0.1513, Train Steps/Sec: 11.78, Grad Norm: 0.0096 +[2025-02-20 23:39:42] (step=0050900) Train Loss: 0.1500, Train Steps/Sec: 12.63, Grad Norm: 0.0092 +[2025-02-20 23:39:50] (step=0051000) Train Loss: 0.1510, Train Steps/Sec: 12.64, Grad Norm: 0.0096 +[2025-02-20 23:39:58] (step=0051100) Train Loss: 0.1493, Train Steps/Sec: 12.61, Grad Norm: 0.0092 +[2025-02-20 23:40:06] (step=0051200) Train Loss: 0.1518, Train Steps/Sec: 12.62, Grad Norm: 0.0096 +[2025-02-20 23:40:15] (step=0051300) Train Loss: 0.1498, Train Steps/Sec: 11.28, Grad Norm: 0.0099 +[2025-02-20 23:40:23] (step=0051400) Train Loss: 0.1496, Train Steps/Sec: 12.58, Grad Norm: 0.0094 +[2025-02-20 23:40:31] (step=0051500) Train Loss: 0.1503, Train Steps/Sec: 12.60, Grad Norm: 0.0096 +[2025-02-20 23:40:39] (step=0051600) Train Loss: 0.1510, Train Steps/Sec: 12.57, Grad Norm: 0.0095 +[2025-02-20 23:40:46] (step=0051700) Train Loss: 0.1504, Train Steps/Sec: 12.60, Grad Norm: 0.0094 +[2025-02-20 23:40:54] (step=0051800) Train Loss: 0.1506, Train Steps/Sec: 12.57, Grad Norm: 0.0091 +[2025-02-20 23:41:02] (step=0051900) Train Loss: 0.1511, Train Steps/Sec: 12.61, Grad Norm: 0.0095 +[2025-02-20 23:41:12] (step=0052000) Train Loss: 0.1513, Train Steps/Sec: 10.15, Grad Norm: 0.0102 +[2025-02-20 23:41:20] (step=0052100) Train Loss: 0.1504, Train Steps/Sec: 12.15, Grad Norm: 0.0093 +[2025-02-20 23:41:28] (step=0052200) Train Loss: 0.1498, Train Steps/Sec: 12.60, Grad Norm: 0.0096 +[2025-02-20 23:41:36] (step=0052300) Train Loss: 0.1500, Train Steps/Sec: 12.60, Grad Norm: 0.0094 +[2025-02-20 23:41:44] (step=0052400) Train Loss: 0.1506, Train Steps/Sec: 12.60, Grad Norm: 0.0093 +[2025-02-20 23:41:52] (step=0052500) Train Loss: 0.1513, Train Steps/Sec: 12.60, Grad Norm: 0.0094 +[2025-02-20 23:42:01] (step=0052600) Train Loss: 0.1490, Train Steps/Sec: 11.26, Grad Norm: 0.0093 +[2025-02-20 23:42:09] (step=0052700) Train Loss: 0.1503, Train Steps/Sec: 12.59, Grad Norm: 0.0096 +[2025-02-20 23:42:19] (step=0052800) Train Loss: 0.1513, Train Steps/Sec: 10.26, Grad Norm: 0.0096 +[2025-02-20 23:42:31] (step=0052900) Train Loss: 0.1501, Train Steps/Sec: 8.50, Grad Norm: 0.0092 +[2025-02-20 23:42:39] (step=0053000) Train Loss: 0.1520, Train Steps/Sec: 12.17, Grad Norm: 0.0094 +[2025-02-20 23:42:47] (step=0053100) Train Loss: 0.1497, Train Steps/Sec: 12.61, Grad Norm: 0.0095 +[2025-02-20 23:42:55] (step=0053200) Train Loss: 0.1510, Train Steps/Sec: 12.62, Grad Norm: 0.0095 +[2025-02-20 23:43:03] (step=0053300) Train Loss: 0.1523, Train Steps/Sec: 12.57, Grad Norm: 0.0096 +[2025-02-20 23:43:11] (step=0053400) Train Loss: 0.1514, Train Steps/Sec: 12.58, Grad Norm: 0.0093 +[2025-02-20 23:43:18] (step=0053500) Train Loss: 0.1511, Train Steps/Sec: 12.58, Grad Norm: 0.0092 +[2025-02-20 23:43:26] (step=0053600) Train Loss: 0.1499, Train Steps/Sec: 12.59, Grad Norm: 0.0092 +[2025-02-20 23:43:34] (step=0053700) Train Loss: 0.1514, Train Steps/Sec: 12.58, Grad Norm: 0.0095 +[2025-02-20 23:43:43] (step=0053800) Train Loss: 0.1497, Train Steps/Sec: 11.28, Grad Norm: 0.0095 +[2025-02-20 23:43:51] (step=0053900) Train Loss: 0.1502, Train Steps/Sec: 12.60, Grad Norm: 0.0094 +[2025-02-20 23:43:59] (step=0054000) Train Loss: 0.1506, Train Steps/Sec: 12.61, Grad Norm: 0.0095 +[2025-02-20 23:44:08] (step=0054100) Train Loss: 0.1510, Train Steps/Sec: 11.07, Grad Norm: 0.0094 +[2025-02-20 23:44:17] (step=0054200) Train Loss: 0.1504, Train Steps/Sec: 11.78, Grad Norm: 0.0092 +[2025-02-20 23:44:25] (step=0054300) Train Loss: 0.1503, Train Steps/Sec: 12.60, Grad Norm: 0.0095 +[2025-02-20 23:44:32] (step=0054400) Train Loss: 0.1505, Train Steps/Sec: 12.61, Grad Norm: 0.0090 +[2025-02-20 23:44:40] (step=0054500) Train Loss: 0.1509, Train Steps/Sec: 12.63, Grad Norm: 0.0095 +[2025-02-20 23:44:48] (step=0054600) Train Loss: 0.1507, Train Steps/Sec: 12.63, Grad Norm: 0.0091 +[2025-02-20 23:44:56] (step=0054700) Train Loss: 0.1500, Train Steps/Sec: 12.63, Grad Norm: 0.0097 +[2025-02-20 23:45:04] (step=0054800) Train Loss: 0.1491, Train Steps/Sec: 12.63, Grad Norm: 0.0091 +[2025-02-20 23:45:12] (step=0054900) Train Loss: 0.1503, Train Steps/Sec: 12.62, Grad Norm: 0.0094 +[2025-02-20 23:45:22] (step=0055000) Train Loss: 0.1505, Train Steps/Sec: 9.68, Grad Norm: 0.0090 +[2025-02-20 23:45:35] (step=0055100) Train Loss: 0.1518, Train Steps/Sec: 7.90, Grad Norm: 0.0091 +[2025-02-20 23:45:43] (step=0055200) Train Loss: 0.1518, Train Steps/Sec: 12.61, Grad Norm: 0.0097 +[2025-02-20 23:45:51] (step=0055300) Train Loss: 0.1506, Train Steps/Sec: 12.62, Grad Norm: 0.0093 +[2025-02-20 23:45:59] (step=0055400) Train Loss: 0.1508, Train Steps/Sec: 12.62, Grad Norm: 0.0095 +[2025-02-20 23:46:07] (step=0055500) Train Loss: 0.1498, Train Steps/Sec: 12.62, Grad Norm: 0.0095 +[2025-02-20 23:46:15] (step=0055600) Train Loss: 0.1514, Train Steps/Sec: 12.62, Grad Norm: 0.0089 +[2025-02-20 23:46:23] (step=0055700) Train Loss: 0.1512, Train Steps/Sec: 12.60, Grad Norm: 0.0093 +[2025-02-20 23:46:31] (step=0055800) Train Loss: 0.1500, Train Steps/Sec: 12.63, Grad Norm: 0.0089 +[2025-02-20 23:46:39] (step=0055900) Train Loss: 0.1504, Train Steps/Sec: 12.63, Grad Norm: 0.0093 +[2025-02-20 23:46:46] (step=0056000) Train Loss: 0.1509, Train Steps/Sec: 12.63, Grad Norm: 0.0091 +[2025-02-20 23:46:54] (step=0056100) Train Loss: 0.1498, Train Steps/Sec: 12.61, Grad Norm: 0.0090 +[2025-02-20 23:47:03] (step=0056200) Train Loss: 0.1499, Train Steps/Sec: 12.16, Grad Norm: 0.0091 +[2025-02-20 23:47:13] (step=0056300) Train Loss: 0.1498, Train Steps/Sec: 10.07, Grad Norm: 0.0095 +[2025-02-20 23:47:21] (step=0056400) Train Loss: 0.1504, Train Steps/Sec: 12.11, Grad Norm: 0.0092 +[2025-02-20 23:47:29] (step=0056500) Train Loss: 0.1510, Train Steps/Sec: 12.56, Grad Norm: 0.0091 +[2025-02-20 23:47:37] (step=0056600) Train Loss: 0.1501, Train Steps/Sec: 12.60, Grad Norm: 0.0093 +[2025-02-20 23:47:45] (step=0056700) Train Loss: 0.1508, Train Steps/Sec: 12.56, Grad Norm: 0.0091 +[2025-02-20 23:47:53] (step=0056800) Train Loss: 0.1496, Train Steps/Sec: 12.59, Grad Norm: 0.0090 +[2025-02-20 23:48:01] (step=0056900) Train Loss: 0.1516, Train Steps/Sec: 12.59, Grad Norm: 0.0090 +[2025-02-20 23:48:08] (step=0057000) Train Loss: 0.1512, Train Steps/Sec: 12.60, Grad Norm: 0.0090 +[2025-02-20 23:48:17] (step=0057100) Train Loss: 0.1495, Train Steps/Sec: 12.13, Grad Norm: 0.0093 +[2025-02-20 23:48:27] (step=0057200) Train Loss: 0.1498, Train Steps/Sec: 9.70, Grad Norm: 0.0092 +[2025-02-20 23:48:38] (step=0057300) Train Loss: 0.1508, Train Steps/Sec: 8.96, Grad Norm: 0.0091 +[2025-02-20 23:48:46] (step=0057400) Train Loss: 0.1507, Train Steps/Sec: 12.09, Grad Norm: 0.0092 +[2025-02-20 23:48:54] (step=0057500) Train Loss: 0.1493, Train Steps/Sec: 12.60, Grad Norm: 0.0091 +[2025-02-20 23:49:03] (step=0057600) Train Loss: 0.1505, Train Steps/Sec: 11.27, Grad Norm: 0.0088 +[2025-02-20 23:49:11] (step=0057700) Train Loss: 0.1505, Train Steps/Sec: 12.59, Grad Norm: 0.0092 +[2025-02-20 23:49:19] (step=0057800) Train Loss: 0.1508, Train Steps/Sec: 12.55, Grad Norm: 0.0093 +[2025-02-20 23:49:27] (step=0057900) Train Loss: 0.1518, Train Steps/Sec: 12.58, Grad Norm: 0.0097 +[2025-02-20 23:49:35] (step=0058000) Train Loss: 0.1495, Train Steps/Sec: 12.60, Grad Norm: 0.0090 +[2025-02-20 23:49:43] (step=0058100) Train Loss: 0.1497, Train Steps/Sec: 12.60, Grad Norm: 0.0088 +[2025-02-20 23:49:51] (step=0058200) Train Loss: 0.1507, Train Steps/Sec: 12.58, Grad Norm: 0.0092 +[2025-02-20 23:49:59] (step=0058300) Train Loss: 0.1508, Train Steps/Sec: 12.59, Grad Norm: 0.0094 +[2025-02-20 23:50:08] (step=0058400) Train Loss: 0.1496, Train Steps/Sec: 10.74, Grad Norm: 0.0086 +[2025-02-20 23:50:16] (step=0058500) Train Loss: 0.1506, Train Steps/Sec: 12.15, Grad Norm: 0.0094 +[2025-02-20 23:50:24] (step=0058600) Train Loss: 0.1509, Train Steps/Sec: 12.60, Grad Norm: 0.0089 +[2025-02-20 23:50:32] (step=0058700) Train Loss: 0.1510, Train Steps/Sec: 12.61, Grad Norm: 0.0089 +[2025-02-20 23:50:41] (step=0058800) Train Loss: 0.1498, Train Steps/Sec: 11.28, Grad Norm: 0.0090 +[2025-02-20 23:50:49] (step=0058900) Train Loss: 0.1490, Train Steps/Sec: 12.57, Grad Norm: 0.0091 +[2025-02-20 23:50:57] (step=0059000) Train Loss: 0.1511, Train Steps/Sec: 12.61, Grad Norm: 0.0090 +[2025-02-20 23:51:05] (step=0059100) Train Loss: 0.1504, Train Steps/Sec: 12.61, Grad Norm: 0.0088 +[2025-02-20 23:51:13] (step=0059200) Train Loss: 0.1506, Train Steps/Sec: 12.62, Grad Norm: 0.0087 +[2025-02-20 23:51:21] (step=0059300) Train Loss: 0.1494, Train Steps/Sec: 12.17, Grad Norm: 0.0088 +[2025-02-20 23:51:33] (step=0059400) Train Loss: 0.1501, Train Steps/Sec: 8.32, Grad Norm: 0.0092 +[2025-02-20 23:51:43] (step=0059500) Train Loss: 0.1503, Train Steps/Sec: 9.76, Grad Norm: 0.0092 +[2025-02-20 23:51:51] (step=0059600) Train Loss: 0.1507, Train Steps/Sec: 12.62, Grad Norm: 0.0090 +[2025-02-20 23:51:59] (step=0059700) Train Loss: 0.1499, Train Steps/Sec: 12.61, Grad Norm: 0.0088 +[2025-02-20 23:52:07] (step=0059800) Train Loss: 0.1511, Train Steps/Sec: 12.60, Grad Norm: 0.0092 +[2025-02-20 23:52:15] (step=0059900) Train Loss: 0.1502, Train Steps/Sec: 12.62, Grad Norm: 0.0086 +[2025-02-20 23:52:23] (step=0060000) Train Loss: 0.1495, Train Steps/Sec: 12.61, Grad Norm: 0.0088 +[2025-02-20 23:52:25] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0060000.pt +[2025-02-20 23:52:34] (step=0060100) Train Loss: 0.1505, Train Steps/Sec: 8.97, Grad Norm: 0.0088 +[2025-02-20 23:52:42] (step=0060200) Train Loss: 0.1498, Train Steps/Sec: 12.60, Grad Norm: 0.0087 +[2025-02-20 23:52:50] (step=0060300) Train Loss: 0.1502, Train Steps/Sec: 12.62, Grad Norm: 0.0089 +[2025-02-20 23:52:58] (step=0060400) Train Loss: 0.1494, Train Steps/Sec: 12.62, Grad Norm: 0.0090 +[2025-02-20 23:53:07] (step=0060500) Train Loss: 0.1503, Train Steps/Sec: 11.08, Grad Norm: 0.0094 +[2025-02-20 23:53:16] (step=0060600) Train Loss: 0.1510, Train Steps/Sec: 11.78, Grad Norm: 0.0092 +[2025-02-20 23:53:23] (step=0060700) Train Loss: 0.1512, Train Steps/Sec: 12.62, Grad Norm: 0.0091 +[2025-02-20 23:53:31] (step=0060800) Train Loss: 0.1501, Train Steps/Sec: 12.62, Grad Norm: 0.0088 +[2025-02-20 23:53:39] (step=0060900) Train Loss: 0.1500, Train Steps/Sec: 12.61, Grad Norm: 0.0087 +[2025-02-20 23:53:47] (step=0061000) Train Loss: 0.1498, Train Steps/Sec: 12.61, Grad Norm: 0.0089 +[2025-02-20 23:53:55] (step=0061100) Train Loss: 0.1514, Train Steps/Sec: 12.62, Grad Norm: 0.0088 +[2025-02-20 23:54:03] (step=0061200) Train Loss: 0.1498, Train Steps/Sec: 12.62, Grad Norm: 0.0085 +[2025-02-20 23:54:12] (step=0061300) Train Loss: 0.1508, Train Steps/Sec: 11.27, Grad Norm: 0.0091 +[2025-02-20 23:54:20] (step=0061400) Train Loss: 0.1494, Train Steps/Sec: 12.59, Grad Norm: 0.0087 +[2025-02-20 23:54:29] (step=0061500) Train Loss: 0.1490, Train Steps/Sec: 11.32, Grad Norm: 0.0089 +[2025-02-20 23:54:40] (step=0061600) Train Loss: 0.1504, Train Steps/Sec: 8.96, Grad Norm: 0.0086 +[2025-02-20 23:54:50] (step=0061700) Train Loss: 0.1480, Train Steps/Sec: 9.75, Grad Norm: 0.0090 +[2025-02-20 23:54:58] (step=0061800) Train Loss: 0.1505, Train Steps/Sec: 12.62, Grad Norm: 0.0085 +[2025-02-20 23:55:06] (step=0061900) Train Loss: 0.1482, Train Steps/Sec: 12.63, Grad Norm: 0.0088 +[2025-02-20 23:55:14] (step=0062000) Train Loss: 0.1496, Train Steps/Sec: 12.62, Grad Norm: 0.0090 +[2025-02-20 23:55:22] (step=0062100) Train Loss: 0.1501, Train Steps/Sec: 12.60, Grad Norm: 0.0090 +[2025-02-20 23:55:30] (step=0062200) Train Loss: 0.1490, Train Steps/Sec: 12.63, Grad Norm: 0.0087 +[2025-02-20 23:55:38] (step=0062300) Train Loss: 0.1493, Train Steps/Sec: 12.63, Grad Norm: 0.0089 +[2025-02-20 23:55:46] (step=0062400) Train Loss: 0.1506, Train Steps/Sec: 12.62, Grad Norm: 0.0089 +[2025-02-20 23:55:54] (step=0062500) Train Loss: 0.1496, Train Steps/Sec: 12.62, Grad Norm: 0.0087 +[2025-02-20 23:56:03] (step=0062600) Train Loss: 0.1511, Train Steps/Sec: 10.91, Grad Norm: 0.0086 +[2025-02-20 23:56:12] (step=0062700) Train Loss: 0.1514, Train Steps/Sec: 10.73, Grad Norm: 0.0089 +[2025-02-20 23:56:20] (step=0062800) Train Loss: 0.1497, Train Steps/Sec: 12.60, Grad Norm: 0.0088 +[2025-02-20 23:56:28] (step=0062900) Train Loss: 0.1512, Train Steps/Sec: 12.60, Grad Norm: 0.0086 +[2025-02-20 23:56:36] (step=0063000) Train Loss: 0.1491, Train Steps/Sec: 12.60, Grad Norm: 0.0084 +[2025-02-20 23:56:44] (step=0063100) Train Loss: 0.1482, Train Steps/Sec: 12.61, Grad Norm: 0.0085 +[2025-02-20 23:56:52] (step=0063200) Train Loss: 0.1494, Train Steps/Sec: 12.61, Grad Norm: 0.0086 +[2025-02-20 23:57:00] (step=0063300) Train Loss: 0.1505, Train Steps/Sec: 12.61, Grad Norm: 0.0088 +[2025-02-20 23:57:08] (step=0063400) Train Loss: 0.1513, Train Steps/Sec: 12.60, Grad Norm: 0.0090 +[2025-02-20 23:57:16] (step=0063500) Train Loss: 0.1505, Train Steps/Sec: 12.62, Grad Norm: 0.0086 +[2025-02-20 23:57:23] (step=0063600) Train Loss: 0.1501, Train Steps/Sec: 12.60, Grad Norm: 0.0091 +[2025-02-20 23:57:32] (step=0063700) Train Loss: 0.1506, Train Steps/Sec: 11.34, Grad Norm: 0.0091 +[2025-02-20 23:57:43] (step=0063800) Train Loss: 0.1506, Train Steps/Sec: 8.97, Grad Norm: 0.0089 +[2025-02-20 23:57:54] (step=0063900) Train Loss: 0.1488, Train Steps/Sec: 9.42, Grad Norm: 0.0083 +[2025-02-20 23:58:02] (step=0064000) Train Loss: 0.1498, Train Steps/Sec: 12.59, Grad Norm: 0.0086 +[2025-02-20 23:58:10] (step=0064100) Train Loss: 0.1498, Train Steps/Sec: 12.60, Grad Norm: 0.0085 +[2025-02-20 23:58:18] (step=0064200) Train Loss: 0.1507, Train Steps/Sec: 12.61, Grad Norm: 0.0089 +[2025-02-20 23:58:26] (step=0064300) Train Loss: 0.1499, Train Steps/Sec: 12.62, Grad Norm: 0.0089 +[2025-02-20 23:58:34] (step=0064400) Train Loss: 0.1494, Train Steps/Sec: 12.62, Grad Norm: 0.0088 +[2025-02-20 23:58:42] (step=0064500) Train Loss: 0.1503, Train Steps/Sec: 12.63, Grad Norm: 0.0089 +[2025-02-20 23:58:50] (step=0064600) Train Loss: 0.1503, Train Steps/Sec: 12.61, Grad Norm: 0.0089 +[2025-02-20 23:58:58] (step=0064700) Train Loss: 0.1484, Train Steps/Sec: 12.59, Grad Norm: 0.0086 +[2025-02-20 23:59:07] (step=0064800) Train Loss: 0.1499, Train Steps/Sec: 10.74, Grad Norm: 0.0084 +[2025-02-20 23:59:15] (step=0064900) Train Loss: 0.1495, Train Steps/Sec: 12.16, Grad Norm: 0.0088 +[2025-02-20 23:59:23] (step=0065000) Train Loss: 0.1505, Train Steps/Sec: 12.63, Grad Norm: 0.0089 +[2025-02-20 23:59:32] (step=0065100) Train Loss: 0.1507, Train Steps/Sec: 11.25, Grad Norm: 0.0086 +[2025-02-20 23:59:40] (step=0065200) Train Loss: 0.1491, Train Steps/Sec: 12.59, Grad Norm: 0.0084 +[2025-02-20 23:59:48] (step=0065300) Train Loss: 0.1508, Train Steps/Sec: 12.60, Grad Norm: 0.0085 +[2025-02-20 23:59:56] (step=0065400) Train Loss: 0.1496, Train Steps/Sec: 12.60, Grad Norm: 0.0083 +[2025-02-21 00:00:04] (step=0065500) Train Loss: 0.1511, Train Steps/Sec: 12.57, Grad Norm: 0.0089 +[2025-02-21 00:00:12] (step=0065600) Train Loss: 0.1512, Train Steps/Sec: 12.58, Grad Norm: 0.0087 +[2025-02-21 00:00:20] (step=0065700) Train Loss: 0.1503, Train Steps/Sec: 12.58, Grad Norm: 0.0088 +[2025-02-21 00:00:27] (step=0065800) Train Loss: 0.1493, Train Steps/Sec: 12.58, Grad Norm: 0.0090 +[2025-02-21 00:00:37] (step=0065900) Train Loss: 0.1509, Train Steps/Sec: 10.28, Grad Norm: 0.0084 +[2025-02-21 00:00:48] (step=0066000) Train Loss: 0.1505, Train Steps/Sec: 8.96, Grad Norm: 0.0089 +[2025-02-21 00:00:57] (step=0066100) Train Loss: 0.1497, Train Steps/Sec: 11.32, Grad Norm: 0.0085 +[2025-02-21 00:01:05] (step=0066200) Train Loss: 0.1504, Train Steps/Sec: 12.61, Grad Norm: 0.0088 +[2025-02-21 00:01:13] (step=0066300) Train Loss: 0.1505, Train Steps/Sec: 12.56, Grad Norm: 0.0086 +[2025-02-21 00:01:22] (step=0066400) Train Loss: 0.1497, Train Steps/Sec: 11.23, Grad Norm: 0.0087 +[2025-02-21 00:01:30] (step=0066500) Train Loss: 0.1518, Train Steps/Sec: 12.58, Grad Norm: 0.0087 +[2025-02-21 00:01:38] (step=0066600) Train Loss: 0.1509, Train Steps/Sec: 12.62, Grad Norm: 0.0084 +[2025-02-21 00:01:46] (step=0066700) Train Loss: 0.1497, Train Steps/Sec: 12.56, Grad Norm: 0.0084 +[2025-02-21 00:01:54] (step=0066800) Train Loss: 0.1505, Train Steps/Sec: 12.54, Grad Norm: 0.0088 +[2025-02-21 00:02:03] (step=0066900) Train Loss: 0.1495, Train Steps/Sec: 10.73, Grad Norm: 0.0087 +[2025-02-21 00:02:12] (step=0067000) Train Loss: 0.1489, Train Steps/Sec: 11.40, Grad Norm: 0.0085 +[2025-02-21 00:02:20] (step=0067100) Train Loss: 0.1502, Train Steps/Sec: 12.60, Grad Norm: 0.0089 +[2025-02-21 00:02:28] (step=0067200) Train Loss: 0.1502, Train Steps/Sec: 12.62, Grad Norm: 0.0085 +[2025-02-21 00:02:36] (step=0067300) Train Loss: 0.1509, Train Steps/Sec: 12.63, Grad Norm: 0.0086 +[2025-02-21 00:02:44] (step=0067400) Train Loss: 0.1501, Train Steps/Sec: 12.62, Grad Norm: 0.0085 +[2025-02-21 00:02:52] (step=0067500) Train Loss: 0.1527, Train Steps/Sec: 12.62, Grad Norm: 0.0088 +[2025-02-21 00:03:00] (step=0067600) Train Loss: 0.1493, Train Steps/Sec: 11.26, Grad Norm: 0.0083 +[2025-02-21 00:03:08] (step=0067700) Train Loss: 0.1495, Train Steps/Sec: 12.54, Grad Norm: 0.0083 +[2025-02-21 00:03:16] (step=0067800) Train Loss: 0.1505, Train Steps/Sec: 12.53, Grad Norm: 0.0086 +[2025-02-21 00:03:24] (step=0067900) Train Loss: 0.1493, Train Steps/Sec: 12.54, Grad Norm: 0.0083 +[2025-02-21 00:03:33] (step=0068000) Train Loss: 0.1484, Train Steps/Sec: 12.08, Grad Norm: 0.0086 +[2025-02-21 00:03:42] (step=0068100) Train Loss: 0.1505, Train Steps/Sec: 10.24, Grad Norm: 0.0084 +[2025-02-21 00:03:54] (step=0068200) Train Loss: 0.1488, Train Steps/Sec: 8.95, Grad Norm: 0.0086 +[2025-02-21 00:04:02] (step=0068300) Train Loss: 0.1493, Train Steps/Sec: 11.68, Grad Norm: 0.0087 +[2025-02-21 00:04:10] (step=0068400) Train Loss: 0.1502, Train Steps/Sec: 12.56, Grad Norm: 0.0085 +[2025-02-21 00:04:18] (step=0068500) Train Loss: 0.1494, Train Steps/Sec: 12.57, Grad Norm: 0.0083 +[2025-02-21 00:04:26] (step=0068600) Train Loss: 0.1504, Train Steps/Sec: 12.57, Grad Norm: 0.0085 +[2025-02-21 00:04:34] (step=0068700) Train Loss: 0.1509, Train Steps/Sec: 12.57, Grad Norm: 0.0083 +[2025-02-21 00:04:42] (step=0068800) Train Loss: 0.1504, Train Steps/Sec: 12.48, Grad Norm: 0.0082 +[2025-02-21 00:04:51] (step=0068900) Train Loss: 0.1504, Train Steps/Sec: 11.20, Grad Norm: 0.0083 +[2025-02-21 00:04:59] (step=0069000) Train Loss: 0.1490, Train Steps/Sec: 12.57, Grad Norm: 0.0087 +[2025-02-21 00:05:09] (step=0069100) Train Loss: 0.1505, Train Steps/Sec: 10.08, Grad Norm: 0.0082 +[2025-02-21 00:05:17] (step=0069200) Train Loss: 0.1490, Train Steps/Sec: 12.15, Grad Norm: 0.0086 +[2025-02-21 00:05:25] (step=0069300) Train Loss: 0.1500, Train Steps/Sec: 12.59, Grad Norm: 0.0089 +[2025-02-21 00:05:33] (step=0069400) Train Loss: 0.1504, Train Steps/Sec: 12.60, Grad Norm: 0.0085 +[2025-02-21 00:05:41] (step=0069500) Train Loss: 0.1501, Train Steps/Sec: 12.60, Grad Norm: 0.0084 +[2025-02-21 00:05:49] (step=0069600) Train Loss: 0.1497, Train Steps/Sec: 12.61, Grad Norm: 0.0083 +[2025-02-21 00:05:57] (step=0069700) Train Loss: 0.1487, Train Steps/Sec: 12.61, Grad Norm: 0.0083 +[2025-02-21 00:06:05] (step=0069800) Train Loss: 0.1497, Train Steps/Sec: 12.60, Grad Norm: 0.0089 +[2025-02-21 00:06:13] (step=0069900) Train Loss: 0.1502, Train Steps/Sec: 12.62, Grad Norm: 0.0082 +[2025-02-21 00:06:21] (step=0070000) Train Loss: 0.1514, Train Steps/Sec: 12.62, Grad Norm: 0.0086 +[2025-02-21 00:06:29] (step=0070100) Train Loss: 0.1483, Train Steps/Sec: 11.23, Grad Norm: 0.0082 +[2025-02-21 00:06:38] (step=0070200) Train Loss: 0.1498, Train Steps/Sec: 11.71, Grad Norm: 0.0084 +[2025-02-21 00:06:48] (step=0070300) Train Loss: 0.1496, Train Steps/Sec: 9.75, Grad Norm: 0.0085 +[2025-02-21 00:06:59] (step=0070400) Train Loss: 0.1502, Train Steps/Sec: 8.94, Grad Norm: 0.0083 +[2025-02-21 00:07:08] (step=0070500) Train Loss: 0.1502, Train Steps/Sec: 12.12, Grad Norm: 0.0082 +[2025-02-21 00:07:16] (step=0070600) Train Loss: 0.1477, Train Steps/Sec: 12.63, Grad Norm: 0.0085 +[2025-02-21 00:07:24] (step=0070700) Train Loss: 0.1504, Train Steps/Sec: 12.62, Grad Norm: 0.0082 +[2025-02-21 00:07:31] (step=0070800) Train Loss: 0.1489, Train Steps/Sec: 12.64, Grad Norm: 0.0081 +[2025-02-21 00:07:39] (step=0070900) Train Loss: 0.1490, Train Steps/Sec: 12.63, Grad Norm: 0.0083 +[2025-02-21 00:07:47] (step=0071000) Train Loss: 0.1487, Train Steps/Sec: 12.62, Grad Norm: 0.0089 +[2025-02-21 00:07:55] (step=0071100) Train Loss: 0.1487, Train Steps/Sec: 12.59, Grad Norm: 0.0081 +[2025-02-21 00:08:05] (step=0071200) Train Loss: 0.1500, Train Steps/Sec: 10.35, Grad Norm: 0.0083 +[2025-02-21 00:08:13] (step=0071300) Train Loss: 0.1492, Train Steps/Sec: 11.71, Grad Norm: 0.0085 +[2025-02-21 00:08:22] (step=0071400) Train Loss: 0.1499, Train Steps/Sec: 11.27, Grad Norm: 0.0084 +[2025-02-21 00:08:30] (step=0071500) Train Loss: 0.1504, Train Steps/Sec: 12.62, Grad Norm: 0.0085 +[2025-02-21 00:08:38] (step=0071600) Train Loss: 0.1493, Train Steps/Sec: 12.63, Grad Norm: 0.0081 +[2025-02-21 00:08:46] (step=0071700) Train Loss: 0.1490, Train Steps/Sec: 12.62, Grad Norm: 0.0086 +[2025-02-21 00:08:54] (step=0071800) Train Loss: 0.1494, Train Steps/Sec: 12.60, Grad Norm: 0.0081 +[2025-02-21 00:09:02] (step=0071900) Train Loss: 0.1502, Train Steps/Sec: 12.59, Grad Norm: 0.0082 +[2025-02-21 00:09:10] (step=0072000) Train Loss: 0.1504, Train Steps/Sec: 12.57, Grad Norm: 0.0085 +[2025-02-21 00:09:18] (step=0072100) Train Loss: 0.1494, Train Steps/Sec: 12.60, Grad Norm: 0.0083 +[2025-02-21 00:09:26] (step=0072200) Train Loss: 0.1500, Train Steps/Sec: 12.60, Grad Norm: 0.0082 +[2025-02-21 00:09:34] (step=0072300) Train Loss: 0.1506, Train Steps/Sec: 12.60, Grad Norm: 0.0084 +[2025-02-21 00:09:42] (step=0072400) Train Loss: 0.1500, Train Steps/Sec: 11.70, Grad Norm: 0.0081 +[2025-02-21 00:09:53] (step=0072500) Train Loss: 0.1490, Train Steps/Sec: 9.78, Grad Norm: 0.0083 +[2025-02-21 00:10:05] (step=0072600) Train Loss: 0.1509, Train Steps/Sec: 8.26, Grad Norm: 0.0085 +[2025-02-21 00:10:13] (step=0072700) Train Loss: 0.1515, Train Steps/Sec: 12.10, Grad Norm: 0.0082 +[2025-02-21 00:10:21] (step=0072800) Train Loss: 0.1503, Train Steps/Sec: 12.61, Grad Norm: 0.0080 +[2025-02-21 00:10:29] (step=0072900) Train Loss: 0.1485, Train Steps/Sec: 12.64, Grad Norm: 0.0083 +[2025-02-21 00:10:37] (step=0073000) Train Loss: 0.1495, Train Steps/Sec: 12.64, Grad Norm: 0.0085 +[2025-02-21 00:10:45] (step=0073100) Train Loss: 0.1493, Train Steps/Sec: 12.63, Grad Norm: 0.0079 +[2025-02-21 00:10:53] (step=0073200) Train Loss: 0.1503, Train Steps/Sec: 12.62, Grad Norm: 0.0084 +[2025-02-21 00:11:01] (step=0073300) Train Loss: 0.1501, Train Steps/Sec: 11.44, Grad Norm: 0.0085 +[2025-02-21 00:11:11] (step=0073400) Train Loss: 0.1483, Train Steps/Sec: 10.71, Grad Norm: 0.0082 +[2025-02-21 00:11:19] (step=0073500) Train Loss: 0.1490, Train Steps/Sec: 12.62, Grad Norm: 0.0084 +[2025-02-21 00:11:26] (step=0073600) Train Loss: 0.1510, Train Steps/Sec: 12.61, Grad Norm: 0.0084 +[2025-02-21 00:11:34] (step=0073700) Train Loss: 0.1498, Train Steps/Sec: 12.60, Grad Norm: 0.0083 +[2025-02-21 00:11:42] (step=0073800) Train Loss: 0.1499, Train Steps/Sec: 12.57, Grad Norm: 0.0087 +[2025-02-21 00:11:51] (step=0073900) Train Loss: 0.1497, Train Steps/Sec: 11.31, Grad Norm: 0.0081 +[2025-02-21 00:11:59] (step=0074000) Train Loss: 0.1489, Train Steps/Sec: 12.60, Grad Norm: 0.0081 +[2025-02-21 00:12:07] (step=0074100) Train Loss: 0.1488, Train Steps/Sec: 12.62, Grad Norm: 0.0087 +[2025-02-21 00:12:15] (step=0074200) Train Loss: 0.1503, Train Steps/Sec: 12.61, Grad Norm: 0.0083 +[2025-02-21 00:12:23] (step=0074300) Train Loss: 0.1497, Train Steps/Sec: 12.61, Grad Norm: 0.0085 +[2025-02-21 00:12:31] (step=0074400) Train Loss: 0.1501, Train Steps/Sec: 12.59, Grad Norm: 0.0083 +[2025-02-21 00:12:39] (step=0074500) Train Loss: 0.1494, Train Steps/Sec: 12.59, Grad Norm: 0.0082 +[2025-02-21 00:12:48] (step=0074600) Train Loss: 0.1504, Train Steps/Sec: 11.33, Grad Norm: 0.0084 +[2025-02-21 00:12:58] (step=0074700) Train Loss: 0.1512, Train Steps/Sec: 9.50, Grad Norm: 0.0079 +[2025-02-21 00:13:09] (step=0074800) Train Loss: 0.1507, Train Steps/Sec: 9.43, Grad Norm: 0.0083 +[2025-02-21 00:13:17] (step=0074900) Train Loss: 0.1501, Train Steps/Sec: 12.16, Grad Norm: 0.0083 +[2025-02-21 00:13:25] (step=0075000) Train Loss: 0.1496, Train Steps/Sec: 12.60, Grad Norm: 0.0079 +[2025-02-21 00:13:34] (step=0075100) Train Loss: 0.1489, Train Steps/Sec: 11.14, Grad Norm: 0.0080 +[2025-02-21 00:13:42] (step=0075200) Train Loss: 0.1496, Train Steps/Sec: 12.57, Grad Norm: 0.0081 +[2025-02-21 00:13:50] (step=0075300) Train Loss: 0.1497, Train Steps/Sec: 12.56, Grad Norm: 0.0082 +[2025-02-21 00:13:58] (step=0075400) Train Loss: 0.1504, Train Steps/Sec: 12.56, Grad Norm: 0.0080 +[2025-02-21 00:14:08] (step=0075500) Train Loss: 0.1478, Train Steps/Sec: 10.10, Grad Norm: 0.0083 +[2025-02-21 00:14:16] (step=0075600) Train Loss: 0.1492, Train Steps/Sec: 12.11, Grad Norm: 0.0081 +[2025-02-21 00:14:24] (step=0075700) Train Loss: 0.1494, Train Steps/Sec: 12.60, Grad Norm: 0.0084 +[2025-02-21 00:14:32] (step=0075800) Train Loss: 0.1490, Train Steps/Sec: 12.60, Grad Norm: 0.0083 +[2025-02-21 00:14:40] (step=0075900) Train Loss: 0.1505, Train Steps/Sec: 12.60, Grad Norm: 0.0085 +[2025-02-21 00:14:48] (step=0076000) Train Loss: 0.1496, Train Steps/Sec: 12.62, Grad Norm: 0.0084 +[2025-02-21 00:14:56] (step=0076100) Train Loss: 0.1508, Train Steps/Sec: 12.60, Grad Norm: 0.0081 +[2025-02-21 00:15:04] (step=0076200) Train Loss: 0.1480, Train Steps/Sec: 12.59, Grad Norm: 0.0084 +[2025-02-21 00:15:12] (step=0076300) Train Loss: 0.1501, Train Steps/Sec: 12.15, Grad Norm: 0.0080 +[2025-02-21 00:15:21] (step=0076400) Train Loss: 0.1491, Train Steps/Sec: 11.23, Grad Norm: 0.0081 +[2025-02-21 00:15:29] (step=0076500) Train Loss: 0.1493, Train Steps/Sec: 12.56, Grad Norm: 0.0083 +[2025-02-21 00:15:37] (step=0076600) Train Loss: 0.1497, Train Steps/Sec: 12.60, Grad Norm: 0.0080 +[2025-02-21 00:15:45] (step=0076700) Train Loss: 0.1494, Train Steps/Sec: 12.61, Grad Norm: 0.0078 +[2025-02-21 00:15:53] (step=0076800) Train Loss: 0.1501, Train Steps/Sec: 11.31, Grad Norm: 0.0082 +[2025-02-21 00:16:05] (step=0076900) Train Loss: 0.1502, Train Steps/Sec: 8.96, Grad Norm: 0.0082 +[2025-02-21 00:16:15] (step=0077000) Train Loss: 0.1486, Train Steps/Sec: 10.01, Grad Norm: 0.0080 +[2025-02-21 00:16:23] (step=0077100) Train Loss: 0.1487, Train Steps/Sec: 12.16, Grad Norm: 0.0080 +[2025-02-21 00:16:31] (step=0077200) Train Loss: 0.1487, Train Steps/Sec: 12.63, Grad Norm: 0.0083 +[2025-02-21 00:16:39] (step=0077300) Train Loss: 0.1493, Train Steps/Sec: 12.62, Grad Norm: 0.0082 +[2025-02-21 00:16:47] (step=0077400) Train Loss: 0.1500, Train Steps/Sec: 12.63, Grad Norm: 0.0084 +[2025-02-21 00:16:54] (step=0077500) Train Loss: 0.1488, Train Steps/Sec: 12.62, Grad Norm: 0.0078 +[2025-02-21 00:17:05] (step=0077600) Train Loss: 0.1499, Train Steps/Sec: 9.48, Grad Norm: 0.0079 +[2025-02-21 00:17:14] (step=0077700) Train Loss: 0.1491, Train Steps/Sec: 11.77, Grad Norm: 0.0079 +[2025-02-21 00:17:21] (step=0077800) Train Loss: 0.1495, Train Steps/Sec: 12.60, Grad Norm: 0.0083 +[2025-02-21 00:17:29] (step=0077900) Train Loss: 0.1502, Train Steps/Sec: 12.61, Grad Norm: 0.0080 +[2025-02-21 00:17:37] (step=0078000) Train Loss: 0.1499, Train Steps/Sec: 12.60, Grad Norm: 0.0079 +[2025-02-21 00:17:45] (step=0078100) Train Loss: 0.1480, Train Steps/Sec: 12.60, Grad Norm: 0.0081 +[2025-02-21 00:17:53] (step=0078200) Train Loss: 0.1484, Train Steps/Sec: 12.59, Grad Norm: 0.0081 +[2025-02-21 00:18:01] (step=0078300) Train Loss: 0.1497, Train Steps/Sec: 12.58, Grad Norm: 0.0080 +[2025-02-21 00:18:09] (step=0078400) Train Loss: 0.1500, Train Steps/Sec: 12.59, Grad Norm: 0.0084 +[2025-02-21 00:18:17] (step=0078500) Train Loss: 0.1508, Train Steps/Sec: 12.60, Grad Norm: 0.0082 +[2025-02-21 00:18:25] (step=0078600) Train Loss: 0.1504, Train Steps/Sec: 12.58, Grad Norm: 0.0082 +[2025-02-21 00:18:33] (step=0078700) Train Loss: 0.1494, Train Steps/Sec: 12.57, Grad Norm: 0.0081 +[2025-02-21 00:18:41] (step=0078800) Train Loss: 0.1496, Train Steps/Sec: 12.53, Grad Norm: 0.0078 +[2025-02-21 00:18:50] (step=0078900) Train Loss: 0.1504, Train Steps/Sec: 10.88, Grad Norm: 0.0081 +[2025-02-21 00:18:59] (step=0079000) Train Loss: 0.1481, Train Steps/Sec: 11.34, Grad Norm: 0.0079 +[2025-02-21 00:19:11] (step=0079100) Train Loss: 0.1480, Train Steps/Sec: 8.54, Grad Norm: 0.0080 +[2025-02-21 00:19:20] (step=0079200) Train Loss: 0.1504, Train Steps/Sec: 10.31, Grad Norm: 0.0079 +[2025-02-21 00:19:29] (step=0079300) Train Loss: 0.1499, Train Steps/Sec: 12.14, Grad Norm: 0.0084 +[2025-02-21 00:19:37] (step=0079400) Train Loss: 0.1489, Train Steps/Sec: 12.62, Grad Norm: 0.0080 +[2025-02-21 00:19:44] (step=0079500) Train Loss: 0.1498, Train Steps/Sec: 12.63, Grad Norm: 0.0077 +[2025-02-21 00:19:52] (step=0079600) Train Loss: 0.1507, Train Steps/Sec: 12.61, Grad Norm: 0.0081 +[2025-02-21 00:20:01] (step=0079700) Train Loss: 0.1484, Train Steps/Sec: 12.21, Grad Norm: 0.0081 +[2025-02-21 00:20:10] (step=0079800) Train Loss: 0.1501, Train Steps/Sec: 10.11, Grad Norm: 0.0079 +[2025-02-21 00:20:18] (step=0079900) Train Loss: 0.1491, Train Steps/Sec: 12.62, Grad Norm: 0.0080 +[2025-02-21 00:20:26] (step=0080000) Train Loss: 0.1491, Train Steps/Sec: 12.60, Grad Norm: 0.0080 +[2025-02-21 00:20:29] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0080000.pt +[2025-02-21 00:20:38] (step=0080100) Train Loss: 0.1492, Train Steps/Sec: 8.35, Grad Norm: 0.0078 +[2025-02-21 00:20:46] (step=0080200) Train Loss: 0.1506, Train Steps/Sec: 12.59, Grad Norm: 0.0078 +[2025-02-21 00:20:54] (step=0080300) Train Loss: 0.1497, Train Steps/Sec: 12.61, Grad Norm: 0.0082 +[2025-02-21 00:21:02] (step=0080400) Train Loss: 0.1497, Train Steps/Sec: 12.58, Grad Norm: 0.0080 +[2025-02-21 00:21:10] (step=0080500) Train Loss: 0.1494, Train Steps/Sec: 12.59, Grad Norm: 0.0080 +[2025-02-21 00:21:18] (step=0080600) Train Loss: 0.1509, Train Steps/Sec: 12.62, Grad Norm: 0.0084 +[2025-02-21 00:21:26] (step=0080700) Train Loss: 0.1496, Train Steps/Sec: 12.60, Grad Norm: 0.0080 +[2025-02-21 00:21:34] (step=0080800) Train Loss: 0.1498, Train Steps/Sec: 12.59, Grad Norm: 0.0080 +[2025-02-21 00:21:42] (step=0080900) Train Loss: 0.1495, Train Steps/Sec: 12.61, Grad Norm: 0.0082 +[2025-02-21 00:21:50] (step=0081000) Train Loss: 0.1500, Train Steps/Sec: 12.61, Grad Norm: 0.0081 +[2025-02-21 00:21:58] (step=0081100) Train Loss: 0.1485, Train Steps/Sec: 12.17, Grad Norm: 0.0078 +[2025-02-21 00:22:06] (step=0081200) Train Loss: 0.1495, Train Steps/Sec: 11.75, Grad Norm: 0.0077 +[2025-02-21 00:22:19] (step=0081300) Train Loss: 0.1490, Train Steps/Sec: 8.30, Grad Norm: 0.0080 +[2025-02-21 00:22:29] (step=0081400) Train Loss: 0.1487, Train Steps/Sec: 9.39, Grad Norm: 0.0078 +[2025-02-21 00:22:37] (step=0081500) Train Loss: 0.1504, Train Steps/Sec: 12.16, Grad Norm: 0.0080 +[2025-02-21 00:22:45] (step=0081600) Train Loss: 0.1502, Train Steps/Sec: 12.64, Grad Norm: 0.0078 +[2025-02-21 00:22:53] (step=0081700) Train Loss: 0.1499, Train Steps/Sec: 12.61, Grad Norm: 0.0076 +[2025-02-21 00:23:01] (step=0081800) Train Loss: 0.1495, Train Steps/Sec: 12.62, Grad Norm: 0.0080 +[2025-02-21 00:23:11] (step=0081900) Train Loss: 0.1502, Train Steps/Sec: 10.11, Grad Norm: 0.0080 +[2025-02-21 00:23:19] (step=0082000) Train Loss: 0.1496, Train Steps/Sec: 12.22, Grad Norm: 0.0080 +[2025-02-21 00:23:27] (step=0082100) Train Loss: 0.1503, Train Steps/Sec: 12.64, Grad Norm: 0.0082 +[2025-02-21 00:23:35] (step=0082200) Train Loss: 0.1490, Train Steps/Sec: 12.59, Grad Norm: 0.0079 +[2025-02-21 00:23:43] (step=0082300) Train Loss: 0.1485, Train Steps/Sec: 12.60, Grad Norm: 0.0077 +[2025-02-21 00:23:51] (step=0082400) Train Loss: 0.1494, Train Steps/Sec: 12.61, Grad Norm: 0.0078 +[2025-02-21 00:23:59] (step=0082500) Train Loss: 0.1499, Train Steps/Sec: 12.61, Grad Norm: 0.0077 +[2025-02-21 00:24:08] (step=0082600) Train Loss: 0.1500, Train Steps/Sec: 11.30, Grad Norm: 0.0081 +[2025-02-21 00:24:16] (step=0082700) Train Loss: 0.1490, Train Steps/Sec: 12.61, Grad Norm: 0.0078 +[2025-02-21 00:24:24] (step=0082800) Train Loss: 0.1498, Train Steps/Sec: 12.60, Grad Norm: 0.0080 +[2025-02-21 00:24:32] (step=0082900) Train Loss: 0.1502, Train Steps/Sec: 12.61, Grad Norm: 0.0078 +[2025-02-21 00:24:39] (step=0083000) Train Loss: 0.1492, Train Steps/Sec: 12.61, Grad Norm: 0.0077 +[2025-02-21 00:24:47] (step=0083100) Train Loss: 0.1477, Train Steps/Sec: 12.61, Grad Norm: 0.0081 +[2025-02-21 00:24:55] (step=0083200) Train Loss: 0.1491, Train Steps/Sec: 12.62, Grad Norm: 0.0078 +[2025-02-21 00:25:04] (step=0083300) Train Loss: 0.1495, Train Steps/Sec: 12.15, Grad Norm: 0.0079 +[2025-02-21 00:25:13] (step=0083400) Train Loss: 0.1492, Train Steps/Sec: 10.37, Grad Norm: 0.0079 +[2025-02-21 00:25:24] (step=0083500) Train Loss: 0.1494, Train Steps/Sec: 9.14, Grad Norm: 0.0083 +[2025-02-21 00:25:34] (step=0083600) Train Loss: 0.1482, Train Steps/Sec: 10.64, Grad Norm: 0.0079 +[2025-02-21 00:25:42] (step=0083700) Train Loss: 0.1501, Train Steps/Sec: 12.15, Grad Norm: 0.0078 +[2025-02-21 00:25:50] (step=0083800) Train Loss: 0.1490, Train Steps/Sec: 12.56, Grad Norm: 0.0082 +[2025-02-21 00:25:59] (step=0083900) Train Loss: 0.1501, Train Steps/Sec: 11.28, Grad Norm: 0.0078 +[2025-02-21 00:26:08] (step=0084000) Train Loss: 0.1493, Train Steps/Sec: 10.96, Grad Norm: 0.0079 +[2025-02-21 00:26:17] (step=0084100) Train Loss: 0.1500, Train Steps/Sec: 11.09, Grad Norm: 0.0077 +[2025-02-21 00:26:25] (step=0084200) Train Loss: 0.1501, Train Steps/Sec: 12.61, Grad Norm: 0.0079 +[2025-02-21 00:26:33] (step=0084300) Train Loss: 0.1499, Train Steps/Sec: 12.61, Grad Norm: 0.0081 +[2025-02-21 00:26:41] (step=0084400) Train Loss: 0.1492, Train Steps/Sec: 12.62, Grad Norm: 0.0078 +[2025-02-21 00:26:48] (step=0084500) Train Loss: 0.1489, Train Steps/Sec: 12.62, Grad Norm: 0.0076 +[2025-02-21 00:26:56] (step=0084600) Train Loss: 0.1495, Train Steps/Sec: 12.63, Grad Norm: 0.0078 +[2025-02-21 00:27:04] (step=0084700) Train Loss: 0.1487, Train Steps/Sec: 12.64, Grad Norm: 0.0078 +[2025-02-21 00:27:12] (step=0084800) Train Loss: 0.1505, Train Steps/Sec: 12.63, Grad Norm: 0.0080 +[2025-02-21 00:27:20] (step=0084900) Train Loss: 0.1487, Train Steps/Sec: 12.63, Grad Norm: 0.0079 +[2025-02-21 00:27:28] (step=0085000) Train Loss: 0.1483, Train Steps/Sec: 12.63, Grad Norm: 0.0077 +[2025-02-21 00:27:37] (step=0085100) Train Loss: 0.1484, Train Steps/Sec: 11.25, Grad Norm: 0.0080 +[2025-02-21 00:27:45] (step=0085200) Train Loss: 0.1493, Train Steps/Sec: 12.60, Grad Norm: 0.0081 +[2025-02-21 00:27:53] (step=0085300) Train Loss: 0.1492, Train Steps/Sec: 12.61, Grad Norm: 0.0079 +[2025-02-21 00:28:01] (step=0085400) Train Loss: 0.1505, Train Steps/Sec: 12.60, Grad Norm: 0.0082 +[2025-02-21 00:28:09] (step=0085500) Train Loss: 0.1494, Train Steps/Sec: 11.72, Grad Norm: 0.0079 +[2025-02-21 00:28:19] (step=0085600) Train Loss: 0.1485, Train Steps/Sec: 10.05, Grad Norm: 0.0079 +[2025-02-21 00:28:30] (step=0085700) Train Loss: 0.1488, Train Steps/Sec: 9.71, Grad Norm: 0.0076 +[2025-02-21 00:28:39] (step=0085800) Train Loss: 0.1498, Train Steps/Sec: 10.33, Grad Norm: 0.0079 +[2025-02-21 00:28:47] (step=0085900) Train Loss: 0.1506, Train Steps/Sec: 12.60, Grad Norm: 0.0080 +[2025-02-21 00:28:55] (step=0086000) Train Loss: 0.1483, Train Steps/Sec: 12.58, Grad Norm: 0.0081 +[2025-02-21 00:29:03] (step=0086100) Train Loss: 0.1497, Train Steps/Sec: 12.60, Grad Norm: 0.0078 +[2025-02-21 00:29:13] (step=0086200) Train Loss: 0.1496, Train Steps/Sec: 9.86, Grad Norm: 0.0079 +[2025-02-21 00:29:21] (step=0086300) Train Loss: 0.1484, Train Steps/Sec: 12.54, Grad Norm: 0.0076 +[2025-02-21 00:29:30] (step=0086400) Train Loss: 0.1491, Train Steps/Sec: 11.31, Grad Norm: 0.0076 +[2025-02-21 00:29:38] (step=0086500) Train Loss: 0.1501, Train Steps/Sec: 12.57, Grad Norm: 0.0077 +[2025-02-21 00:29:46] (step=0086600) Train Loss: 0.1484, Train Steps/Sec: 12.61, Grad Norm: 0.0078 +[2025-02-21 00:29:54] (step=0086700) Train Loss: 0.1506, Train Steps/Sec: 12.57, Grad Norm: 0.0078 +[2025-02-21 00:30:02] (step=0086800) Train Loss: 0.1490, Train Steps/Sec: 12.58, Grad Norm: 0.0077 +[2025-02-21 00:30:10] (step=0086900) Train Loss: 0.1506, Train Steps/Sec: 12.38, Grad Norm: 0.0077 +[2025-02-21 00:30:18] (step=0087000) Train Loss: 0.1507, Train Steps/Sec: 12.58, Grad Norm: 0.0078 +[2025-02-21 00:30:26] (step=0087100) Train Loss: 0.1500, Train Steps/Sec: 12.63, Grad Norm: 0.0078 +[2025-02-21 00:30:34] (step=0087200) Train Loss: 0.1501, Train Steps/Sec: 12.61, Grad Norm: 0.0075 +[2025-02-21 00:30:42] (step=0087300) Train Loss: 0.1501, Train Steps/Sec: 12.60, Grad Norm: 0.0076 +[2025-02-21 00:30:50] (step=0087400) Train Loss: 0.1494, Train Steps/Sec: 12.61, Grad Norm: 0.0077 +[2025-02-21 00:30:57] (step=0087500) Train Loss: 0.1488, Train Steps/Sec: 12.62, Grad Norm: 0.0078 +[2025-02-21 00:31:06] (step=0087600) Train Loss: 0.1490, Train Steps/Sec: 11.26, Grad Norm: 0.0077 +[2025-02-21 00:31:15] (step=0087700) Train Loss: 0.1490, Train Steps/Sec: 11.63, Grad Norm: 0.0078 +[2025-02-21 00:31:26] (step=0087800) Train Loss: 0.1491, Train Steps/Sec: 9.22, Grad Norm: 0.0078 +[2025-02-21 00:31:35] (step=0087900) Train Loss: 0.1489, Train Steps/Sec: 10.60, Grad Norm: 0.0079 +[2025-02-21 00:31:45] (step=0088000) Train Loss: 0.1480, Train Steps/Sec: 10.28, Grad Norm: 0.0076 +[2025-02-21 00:31:53] (step=0088100) Train Loss: 0.1491, Train Steps/Sec: 12.61, Grad Norm: 0.0078 +[2025-02-21 00:32:01] (step=0088200) Train Loss: 0.1493, Train Steps/Sec: 12.61, Grad Norm: 0.0076 +[2025-02-21 00:32:11] (step=0088300) Train Loss: 0.1487, Train Steps/Sec: 9.88, Grad Norm: 0.0078 +[2025-02-21 00:32:19] (step=0088400) Train Loss: 0.1498, Train Steps/Sec: 12.61, Grad Norm: 0.0079 +[2025-02-21 00:32:27] (step=0088500) Train Loss: 0.1492, Train Steps/Sec: 12.61, Grad Norm: 0.0082 +[2025-02-21 00:32:35] (step=0088600) Train Loss: 0.1495, Train Steps/Sec: 12.59, Grad Norm: 0.0078 +[2025-02-21 00:32:43] (step=0088700) Train Loss: 0.1493, Train Steps/Sec: 12.57, Grad Norm: 0.0080 +[2025-02-21 00:32:51] (step=0088800) Train Loss: 0.1494, Train Steps/Sec: 12.54, Grad Norm: 0.0076 +[2025-02-21 00:33:00] (step=0088900) Train Loss: 0.1485, Train Steps/Sec: 11.29, Grad Norm: 0.0076 +[2025-02-21 00:33:07] (step=0089000) Train Loss: 0.1491, Train Steps/Sec: 12.60, Grad Norm: 0.0075 +[2025-02-21 00:33:15] (step=0089100) Train Loss: 0.1507, Train Steps/Sec: 12.60, Grad Norm: 0.0077 +[2025-02-21 00:33:23] (step=0089200) Train Loss: 0.1481, Train Steps/Sec: 12.60, Grad Norm: 0.0082 +[2025-02-21 00:33:31] (step=0089300) Train Loss: 0.1494, Train Steps/Sec: 12.63, Grad Norm: 0.0078 +[2025-02-21 00:33:39] (step=0089400) Train Loss: 0.1494, Train Steps/Sec: 12.60, Grad Norm: 0.0080 +[2025-02-21 00:33:47] (step=0089500) Train Loss: 0.1506, Train Steps/Sec: 12.61, Grad Norm: 0.0079 +[2025-02-21 00:33:55] (step=0089600) Train Loss: 0.1501, Train Steps/Sec: 12.62, Grad Norm: 0.0078 +[2025-02-21 00:34:03] (step=0089700) Train Loss: 0.1501, Train Steps/Sec: 12.62, Grad Norm: 0.0079 +[2025-02-21 00:34:11] (step=0089800) Train Loss: 0.1493, Train Steps/Sec: 12.62, Grad Norm: 0.0076 +[2025-02-21 00:34:19] (step=0089900) Train Loss: 0.1487, Train Steps/Sec: 11.71, Grad Norm: 0.0078 +[2025-02-21 00:34:30] (step=0090000) Train Loss: 0.1481, Train Steps/Sec: 9.20, Grad Norm: 0.0080 +[2025-02-21 00:34:41] (step=0090100) Train Loss: 0.1484, Train Steps/Sec: 9.36, Grad Norm: 0.0077 +[2025-02-21 00:34:50] (step=0090200) Train Loss: 0.1486, Train Steps/Sec: 10.96, Grad Norm: 0.0079 +[2025-02-21 00:34:58] (step=0090300) Train Loss: 0.1487, Train Steps/Sec: 12.51, Grad Norm: 0.0074 +[2025-02-21 00:35:07] (step=0090400) Train Loss: 0.1503, Train Steps/Sec: 11.04, Grad Norm: 0.0076 +[2025-02-21 00:35:16] (step=0090500) Train Loss: 0.1480, Train Steps/Sec: 11.08, Grad Norm: 0.0074 +[2025-02-21 00:35:24] (step=0090600) Train Loss: 0.1495, Train Steps/Sec: 12.60, Grad Norm: 0.0077 +[2025-02-21 00:35:32] (step=0090700) Train Loss: 0.1493, Train Steps/Sec: 12.60, Grad Norm: 0.0077 +[2025-02-21 00:35:40] (step=0090800) Train Loss: 0.1492, Train Steps/Sec: 12.50, Grad Norm: 0.0076 +[2025-02-21 00:35:48] (step=0090900) Train Loss: 0.1494, Train Steps/Sec: 12.55, Grad Norm: 0.0075 +[2025-02-21 00:35:56] (step=0091000) Train Loss: 0.1489, Train Steps/Sec: 12.52, Grad Norm: 0.0074 +[2025-02-21 00:36:04] (step=0091100) Train Loss: 0.1492, Train Steps/Sec: 12.52, Grad Norm: 0.0078 +[2025-02-21 00:36:12] (step=0091200) Train Loss: 0.1490, Train Steps/Sec: 12.52, Grad Norm: 0.0077 +[2025-02-21 00:36:20] (step=0091300) Train Loss: 0.1502, Train Steps/Sec: 12.48, Grad Norm: 0.0077 +[2025-02-21 00:36:29] (step=0091400) Train Loss: 0.1495, Train Steps/Sec: 11.26, Grad Norm: 0.0077 +[2025-02-21 00:36:37] (step=0091500) Train Loss: 0.1484, Train Steps/Sec: 12.63, Grad Norm: 0.0078 +[2025-02-21 00:36:45] (step=0091600) Train Loss: 0.1496, Train Steps/Sec: 12.63, Grad Norm: 0.0074 +[2025-02-21 00:36:53] (step=0091700) Train Loss: 0.1498, Train Steps/Sec: 12.64, Grad Norm: 0.0077 +[2025-02-21 00:37:01] (step=0091800) Train Loss: 0.1503, Train Steps/Sec: 12.64, Grad Norm: 0.0075 +[2025-02-21 00:37:09] (step=0091900) Train Loss: 0.1489, Train Steps/Sec: 12.63, Grad Norm: 0.0082 +[2025-02-21 00:37:16] (step=0092000) Train Loss: 0.1499, Train Steps/Sec: 12.64, Grad Norm: 0.0076 +[2025-02-21 00:37:25] (step=0092100) Train Loss: 0.1497, Train Steps/Sec: 11.35, Grad Norm: 0.0074 +[2025-02-21 00:37:36] (step=0092200) Train Loss: 0.1480, Train Steps/Sec: 9.48, Grad Norm: 0.0074 +[2025-02-21 00:37:45] (step=0092300) Train Loss: 0.1487, Train Steps/Sec: 10.33, Grad Norm: 0.0075 +[2025-02-21 00:37:55] (step=0092400) Train Loss: 0.1488, Train Steps/Sec: 10.98, Grad Norm: 0.0075 +[2025-02-21 00:38:03] (step=0092500) Train Loss: 0.1471, Train Steps/Sec: 12.48, Grad Norm: 0.0078 +[2025-02-21 00:38:16] (step=0092600) Train Loss: 0.1490, Train Steps/Sec: 7.40, Grad Norm: 0.0077 +[2025-02-21 00:38:24] (step=0092700) Train Loss: 0.1488, Train Steps/Sec: 12.59, Grad Norm: 0.0075 +[2025-02-21 00:38:32] (step=0092800) Train Loss: 0.1497, Train Steps/Sec: 12.58, Grad Norm: 0.0075 +[2025-02-21 00:38:40] (step=0092900) Train Loss: 0.1497, Train Steps/Sec: 12.57, Grad Norm: 0.0075 +[2025-02-21 00:38:48] (step=0093000) Train Loss: 0.1501, Train Steps/Sec: 12.60, Grad Norm: 0.0076 +[2025-02-21 00:38:56] (step=0093100) Train Loss: 0.1500, Train Steps/Sec: 12.60, Grad Norm: 0.0076 +[2025-02-21 00:39:04] (step=0093200) Train Loss: 0.1486, Train Steps/Sec: 12.62, Grad Norm: 0.0074 +[2025-02-21 00:39:12] (step=0093300) Train Loss: 0.1494, Train Steps/Sec: 12.62, Grad Norm: 0.0074 +[2025-02-21 00:39:20] (step=0093400) Train Loss: 0.1502, Train Steps/Sec: 12.59, Grad Norm: 0.0075 +[2025-02-21 00:39:28] (step=0093500) Train Loss: 0.1487, Train Steps/Sec: 12.60, Grad Norm: 0.0075 +[2025-02-21 00:39:36] (step=0093600) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0075 +[2025-02-21 00:39:43] (step=0093700) Train Loss: 0.1497, Train Steps/Sec: 12.62, Grad Norm: 0.0077 +[2025-02-21 00:39:51] (step=0093800) Train Loss: 0.1500, Train Steps/Sec: 12.55, Grad Norm: 0.0075 +[2025-02-21 00:40:00] (step=0093900) Train Loss: 0.1502, Train Steps/Sec: 11.23, Grad Norm: 0.0076 +[2025-02-21 00:40:08] (step=0094000) Train Loss: 0.1494, Train Steps/Sec: 12.63, Grad Norm: 0.0075 +[2025-02-21 00:40:16] (step=0094100) Train Loss: 0.1492, Train Steps/Sec: 12.60, Grad Norm: 0.0079 +[2025-02-21 00:40:24] (step=0094200) Train Loss: 0.1506, Train Steps/Sec: 12.62, Grad Norm: 0.0077 +[2025-02-21 00:40:34] (step=0094300) Train Loss: 0.1498, Train Steps/Sec: 10.52, Grad Norm: 0.0074 +[2025-02-21 00:40:45] (step=0094400) Train Loss: 0.1491, Train Steps/Sec: 8.68, Grad Norm: 0.0075 +[2025-02-21 00:40:54] (step=0094500) Train Loss: 0.1491, Train Steps/Sec: 11.03, Grad Norm: 0.0076 +[2025-02-21 00:41:03] (step=0094600) Train Loss: 0.1489, Train Steps/Sec: 11.32, Grad Norm: 0.0075 +[2025-02-21 00:41:13] (step=0094700) Train Loss: 0.1495, Train Steps/Sec: 10.07, Grad Norm: 0.0075 +[2025-02-21 00:41:21] (step=0094800) Train Loss: 0.1492, Train Steps/Sec: 12.22, Grad Norm: 0.0077 +[2025-02-21 00:41:29] (step=0094900) Train Loss: 0.1482, Train Steps/Sec: 12.63, Grad Norm: 0.0075 +[2025-02-21 00:41:37] (step=0095000) Train Loss: 0.1496, Train Steps/Sec: 12.64, Grad Norm: 0.0078 +[2025-02-21 00:41:46] (step=0095100) Train Loss: 0.1491, Train Steps/Sec: 11.21, Grad Norm: 0.0076 +[2025-02-21 00:41:54] (step=0095200) Train Loss: 0.1489, Train Steps/Sec: 12.61, Grad Norm: 0.0075 +[2025-02-21 00:42:02] (step=0095300) Train Loss: 0.1484, Train Steps/Sec: 12.60, Grad Norm: 0.0075 +[2025-02-21 00:42:10] (step=0095400) Train Loss: 0.1481, Train Steps/Sec: 12.62, Grad Norm: 0.0072 +[2025-02-21 00:42:18] (step=0095500) Train Loss: 0.1496, Train Steps/Sec: 12.61, Grad Norm: 0.0075 +[2025-02-21 00:42:26] (step=0095600) Train Loss: 0.1497, Train Steps/Sec: 12.60, Grad Norm: 0.0075 +[2025-02-21 00:42:34] (step=0095700) Train Loss: 0.1495, Train Steps/Sec: 12.63, Grad Norm: 0.0074 +[2025-02-21 00:42:41] (step=0095800) Train Loss: 0.1486, Train Steps/Sec: 12.62, Grad Norm: 0.0078 +[2025-02-21 00:42:49] (step=0095900) Train Loss: 0.1494, Train Steps/Sec: 12.63, Grad Norm: 0.0075 +[2025-02-21 00:42:57] (step=0096000) Train Loss: 0.1494, Train Steps/Sec: 12.62, Grad Norm: 0.0074 +[2025-02-21 00:43:05] (step=0096100) Train Loss: 0.1492, Train Steps/Sec: 12.64, Grad Norm: 0.0078 +[2025-02-21 00:43:13] (step=0096200) Train Loss: 0.1496, Train Steps/Sec: 12.63, Grad Norm: 0.0073 +[2025-02-21 00:43:21] (step=0096300) Train Loss: 0.1499, Train Steps/Sec: 12.56, Grad Norm: 0.0076 +[2025-02-21 00:43:30] (step=0096400) Train Loss: 0.1491, Train Steps/Sec: 10.91, Grad Norm: 0.0076 +[2025-02-21 00:43:40] (step=0096500) Train Loss: 0.1483, Train Steps/Sec: 10.34, Grad Norm: 0.0075 +[2025-02-21 00:43:51] (step=0096600) Train Loss: 0.1489, Train Steps/Sec: 9.18, Grad Norm: 0.0075 +[2025-02-21 00:44:00] (step=0096700) Train Loss: 0.1491, Train Steps/Sec: 11.00, Grad Norm: 0.0073 +[2025-02-21 00:44:09] (step=0096800) Train Loss: 0.1494, Train Steps/Sec: 11.02, Grad Norm: 0.0078 +[2025-02-21 00:44:19] (step=0096900) Train Loss: 0.1497, Train Steps/Sec: 10.43, Grad Norm: 0.0075 +[2025-02-21 00:44:26] (step=0097000) Train Loss: 0.1497, Train Steps/Sec: 12.60, Grad Norm: 0.0075 +[2025-02-21 00:44:34] (step=0097100) Train Loss: 0.1492, Train Steps/Sec: 12.63, Grad Norm: 0.0075 +[2025-02-21 00:44:42] (step=0097200) Train Loss: 0.1489, Train Steps/Sec: 12.63, Grad Norm: 0.0075 +[2025-02-21 00:44:50] (step=0097300) Train Loss: 0.1483, Train Steps/Sec: 12.63, Grad Norm: 0.0077 +[2025-02-21 00:44:58] (step=0097400) Train Loss: 0.1485, Train Steps/Sec: 12.63, Grad Norm: 0.0075 +[2025-02-21 00:45:06] (step=0097500) Train Loss: 0.1484, Train Steps/Sec: 12.63, Grad Norm: 0.0072 +[2025-02-21 00:45:15] (step=0097600) Train Loss: 0.1491, Train Steps/Sec: 11.23, Grad Norm: 0.0074 +[2025-02-21 00:45:23] (step=0097700) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0075 +[2025-02-21 00:45:31] (step=0097800) Train Loss: 0.1497, Train Steps/Sec: 12.62, Grad Norm: 0.0075 +[2025-02-21 00:45:39] (step=0097900) Train Loss: 0.1500, Train Steps/Sec: 12.63, Grad Norm: 0.0079 +[2025-02-21 00:45:47] (step=0098000) Train Loss: 0.1496, Train Steps/Sec: 12.63, Grad Norm: 0.0075 +[2025-02-21 00:45:55] (step=0098100) Train Loss: 0.1489, Train Steps/Sec: 12.64, Grad Norm: 0.0074 +[2025-02-21 00:46:03] (step=0098200) Train Loss: 0.1485, Train Steps/Sec: 12.63, Grad Norm: 0.0077 +[2025-02-21 00:46:10] (step=0098300) Train Loss: 0.1501, Train Steps/Sec: 12.62, Grad Norm: 0.0075 +[2025-02-21 00:46:18] (step=0098400) Train Loss: 0.1506, Train Steps/Sec: 12.64, Grad Norm: 0.0076 +[2025-02-21 00:46:26] (step=0098500) Train Loss: 0.1480, Train Steps/Sec: 12.63, Grad Norm: 0.0074 +[2025-02-21 00:46:35] (step=0098600) Train Loss: 0.1493, Train Steps/Sec: 12.15, Grad Norm: 0.0074 +[2025-02-21 00:46:44] (step=0098700) Train Loss: 0.1497, Train Steps/Sec: 10.07, Grad Norm: 0.0077 +[2025-02-21 00:46:55] (step=0098800) Train Loss: 0.1488, Train Steps/Sec: 9.40, Grad Norm: 0.0075 +[2025-02-21 00:47:05] (step=0098900) Train Loss: 0.1494, Train Steps/Sec: 9.72, Grad Norm: 0.0075 +[2025-02-21 00:47:16] (step=0099000) Train Loss: 0.1494, Train Steps/Sec: 9.50, Grad Norm: 0.0077 +[2025-02-21 00:47:24] (step=0099100) Train Loss: 0.1484, Train Steps/Sec: 12.61, Grad Norm: 0.0072 +[2025-02-21 00:47:32] (step=0099200) Train Loss: 0.1477, Train Steps/Sec: 12.64, Grad Norm: 0.0072 +[2025-02-21 00:47:40] (step=0099300) Train Loss: 0.1490, Train Steps/Sec: 12.62, Grad Norm: 0.0075 +[2025-02-21 00:47:48] (step=0099400) Train Loss: 0.1500, Train Steps/Sec: 12.62, Grad Norm: 0.0073 +[2025-02-21 00:47:56] (step=0099500) Train Loss: 0.1493, Train Steps/Sec: 12.62, Grad Norm: 0.0073 +[2025-02-21 00:48:03] (step=0099600) Train Loss: 0.1489, Train Steps/Sec: 12.61, Grad Norm: 0.0075 +[2025-02-21 00:48:11] (step=0099700) Train Loss: 0.1486, Train Steps/Sec: 12.58, Grad Norm: 0.0073 +[2025-02-21 00:48:19] (step=0099800) Train Loss: 0.1498, Train Steps/Sec: 12.62, Grad Norm: 0.0075 +[2025-02-21 00:48:27] (step=0099900) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0074 +[2025-02-21 00:48:35] (step=0100000) Train Loss: 0.1507, Train Steps/Sec: 12.61, Grad Norm: 0.0074 +[2025-02-21 00:48:39] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0100000.pt +[2025-02-21 00:48:48] (step=0100100) Train Loss: 0.1494, Train Steps/Sec: 7.77, Grad Norm: 0.0075 +[2025-02-21 00:48:56] (step=0100200) Train Loss: 0.1487, Train Steps/Sec: 12.59, Grad Norm: 0.0073 +[2025-02-21 00:49:04] (step=0100300) Train Loss: 0.1485, Train Steps/Sec: 12.62, Grad Norm: 0.0075 +[2025-02-21 00:49:12] (step=0100400) Train Loss: 0.1497, Train Steps/Sec: 12.62, Grad Norm: 0.0073 +[2025-02-21 00:49:20] (step=0100500) Train Loss: 0.1495, Train Steps/Sec: 12.63, Grad Norm: 0.0072 +[2025-02-21 00:49:28] (step=0100600) Train Loss: 0.1486, Train Steps/Sec: 12.64, Grad Norm: 0.0074 +[2025-02-21 00:49:36] (step=0100700) Train Loss: 0.1492, Train Steps/Sec: 12.61, Grad Norm: 0.0075 +[2025-02-21 00:49:44] (step=0100800) Train Loss: 0.1479, Train Steps/Sec: 11.72, Grad Norm: 0.0073 +[2025-02-21 00:49:55] (step=0100900) Train Loss: 0.1507, Train Steps/Sec: 9.23, Grad Norm: 0.0074 +[2025-02-21 00:50:04] (step=0101000) Train Loss: 0.1489, Train Steps/Sec: 10.65, Grad Norm: 0.0076 +[2025-02-21 00:50:15] (step=0101100) Train Loss: 0.1494, Train Steps/Sec: 9.55, Grad Norm: 0.0074 +[2025-02-21 00:50:24] (step=0101200) Train Loss: 0.1490, Train Steps/Sec: 10.99, Grad Norm: 0.0074 +[2025-02-21 00:50:32] (step=0101300) Train Loss: 0.1486, Train Steps/Sec: 12.53, Grad Norm: 0.0074 +[2025-02-21 00:50:41] (step=0101400) Train Loss: 0.1503, Train Steps/Sec: 11.24, Grad Norm: 0.0077 +[2025-02-21 00:50:49] (step=0101500) Train Loss: 0.1490, Train Steps/Sec: 12.62, Grad Norm: 0.0074 +[2025-02-21 00:50:57] (step=0101600) Train Loss: 0.1486, Train Steps/Sec: 12.61, Grad Norm: 0.0074 +[2025-02-21 00:51:05] (step=0101700) Train Loss: 0.1485, Train Steps/Sec: 12.63, Grad Norm: 0.0071 +[2025-02-21 00:51:13] (step=0101800) Train Loss: 0.1490, Train Steps/Sec: 12.65, Grad Norm: 0.0071 +[2025-02-21 00:51:20] (step=0101900) Train Loss: 0.1485, Train Steps/Sec: 12.64, Grad Norm: 0.0076 +[2025-02-21 00:51:28] (step=0102000) Train Loss: 0.1503, Train Steps/Sec: 12.64, Grad Norm: 0.0075 +[2025-02-21 00:51:36] (step=0102100) Train Loss: 0.1484, Train Steps/Sec: 12.64, Grad Norm: 0.0074 +[2025-02-21 00:51:44] (step=0102200) Train Loss: 0.1480, Train Steps/Sec: 12.63, Grad Norm: 0.0074 +[2025-02-21 00:51:52] (step=0102300) Train Loss: 0.1487, Train Steps/Sec: 12.63, Grad Norm: 0.0074 +[2025-02-21 00:52:00] (step=0102400) Train Loss: 0.1477, Train Steps/Sec: 12.62, Grad Norm: 0.0072 +[2025-02-21 00:52:08] (step=0102500) Train Loss: 0.1503, Train Steps/Sec: 12.65, Grad Norm: 0.0071 +[2025-02-21 00:52:17] (step=0102600) Train Loss: 0.1502, Train Steps/Sec: 11.21, Grad Norm: 0.0072 +[2025-02-21 00:52:25] (step=0102700) Train Loss: 0.1490, Train Steps/Sec: 12.55, Grad Norm: 0.0073 +[2025-02-21 00:52:33] (step=0102800) Train Loss: 0.1488, Train Steps/Sec: 12.60, Grad Norm: 0.0076 +[2025-02-21 00:52:41] (step=0102900) Train Loss: 0.1491, Train Steps/Sec: 12.14, Grad Norm: 0.0075 +[2025-02-21 00:52:50] (step=0103000) Train Loss: 0.1478, Train Steps/Sec: 11.33, Grad Norm: 0.0072 +[2025-02-21 00:53:00] (step=0103100) Train Loss: 0.1500, Train Steps/Sec: 9.44, Grad Norm: 0.0072 +[2025-02-21 00:53:10] (step=0103200) Train Loss: 0.1489, Train Steps/Sec: 10.41, Grad Norm: 0.0073 +[2025-02-21 00:53:21] (step=0103300) Train Loss: 0.1497, Train Steps/Sec: 9.34, Grad Norm: 0.0076 +[2025-02-21 00:53:29] (step=0103400) Train Loss: 0.1498, Train Steps/Sec: 11.72, Grad Norm: 0.0075 +[2025-02-21 00:53:37] (step=0103500) Train Loss: 0.1501, Train Steps/Sec: 12.60, Grad Norm: 0.0073 +[2025-02-21 00:53:45] (step=0103600) Train Loss: 0.1493, Train Steps/Sec: 12.64, Grad Norm: 0.0072 +[2025-02-21 00:53:53] (step=0103700) Train Loss: 0.1500, Train Steps/Sec: 12.63, Grad Norm: 0.0072 +[2025-02-21 00:54:01] (step=0103800) Train Loss: 0.1484, Train Steps/Sec: 12.62, Grad Norm: 0.0073 +[2025-02-21 00:54:10] (step=0103900) Train Loss: 0.1497, Train Steps/Sec: 11.19, Grad Norm: 0.0073 +[2025-02-21 00:54:18] (step=0104000) Train Loss: 0.1483, Train Steps/Sec: 12.61, Grad Norm: 0.0074 +[2025-02-21 00:54:26] (step=0104100) Train Loss: 0.1484, Train Steps/Sec: 12.61, Grad Norm: 0.0077 +[2025-02-21 00:54:34] (step=0104200) Train Loss: 0.1492, Train Steps/Sec: 12.61, Grad Norm: 0.0074 +[2025-02-21 00:54:42] (step=0104300) Train Loss: 0.1481, Train Steps/Sec: 12.61, Grad Norm: 0.0071 +[2025-02-21 00:54:50] (step=0104400) Train Loss: 0.1482, Train Steps/Sec: 12.62, Grad Norm: 0.0073 +[2025-02-21 00:54:57] (step=0104500) Train Loss: 0.1483, Train Steps/Sec: 12.62, Grad Norm: 0.0073 +[2025-02-21 00:55:05] (step=0104600) Train Loss: 0.1491, Train Steps/Sec: 12.61, Grad Norm: 0.0074 +[2025-02-21 00:55:13] (step=0104700) Train Loss: 0.1498, Train Steps/Sec: 12.62, Grad Norm: 0.0074 +[2025-02-21 00:55:21] (step=0104800) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0073 +[2025-02-21 00:55:29] (step=0104900) Train Loss: 0.1484, Train Steps/Sec: 12.61, Grad Norm: 0.0074 +[2025-02-21 00:55:37] (step=0105000) Train Loss: 0.1481, Train Steps/Sec: 12.58, Grad Norm: 0.0072 +[2025-02-21 00:55:46] (step=0105100) Train Loss: 0.1475, Train Steps/Sec: 10.82, Grad Norm: 0.0073 +[2025-02-21 00:55:55] (step=0105200) Train Loss: 0.1487, Train Steps/Sec: 11.29, Grad Norm: 0.0074 +[2025-02-21 00:56:06] (step=0105300) Train Loss: 0.1469, Train Steps/Sec: 9.19, Grad Norm: 0.0071 +[2025-02-21 00:56:17] (step=0105400) Train Loss: 0.1480, Train Steps/Sec: 9.27, Grad Norm: 0.0074 +[2025-02-21 00:56:26] (step=0105500) Train Loss: 0.1502, Train Steps/Sec: 10.66, Grad Norm: 0.0074 +[2025-02-21 00:56:35] (step=0105600) Train Loss: 0.1492, Train Steps/Sec: 11.69, Grad Norm: 0.0071 +[2025-02-21 00:56:43] (step=0105700) Train Loss: 0.1494, Train Steps/Sec: 12.60, Grad Norm: 0.0072 +[2025-02-21 00:56:51] (step=0105800) Train Loss: 0.1482, Train Steps/Sec: 12.60, Grad Norm: 0.0073 +[2025-02-21 00:56:59] (step=0105900) Train Loss: 0.1487, Train Steps/Sec: 12.61, Grad Norm: 0.0078 +[2025-02-21 00:57:07] (step=0106000) Train Loss: 0.1491, Train Steps/Sec: 12.62, Grad Norm: 0.0073 +[2025-02-21 00:57:15] (step=0106100) Train Loss: 0.1493, Train Steps/Sec: 12.61, Grad Norm: 0.0071 +[2025-02-21 00:57:22] (step=0106200) Train Loss: 0.1481, Train Steps/Sec: 12.61, Grad Norm: 0.0074 +[2025-02-21 00:57:30] (step=0106300) Train Loss: 0.1491, Train Steps/Sec: 12.60, Grad Norm: 0.0074 +[2025-02-21 00:57:39] (step=0106400) Train Loss: 0.1492, Train Steps/Sec: 11.11, Grad Norm: 0.0073 +[2025-02-21 00:57:47] (step=0106500) Train Loss: 0.1492, Train Steps/Sec: 12.55, Grad Norm: 0.0074 +[2025-02-21 00:57:55] (step=0106600) Train Loss: 0.1484, Train Steps/Sec: 12.62, Grad Norm: 0.0072 +[2025-02-21 00:58:03] (step=0106700) Train Loss: 0.1478, Train Steps/Sec: 12.61, Grad Norm: 0.0073 +[2025-02-21 00:58:11] (step=0106800) Train Loss: 0.1489, Train Steps/Sec: 12.63, Grad Norm: 0.0075 +[2025-02-21 00:58:19] (step=0106900) Train Loss: 0.1487, Train Steps/Sec: 12.62, Grad Norm: 0.0074 +[2025-02-21 00:58:27] (step=0107000) Train Loss: 0.1474, Train Steps/Sec: 12.63, Grad Norm: 0.0073 +[2025-02-21 00:58:35] (step=0107100) Train Loss: 0.1482, Train Steps/Sec: 12.64, Grad Norm: 0.0074 +[2025-02-21 00:58:43] (step=0107200) Train Loss: 0.1466, Train Steps/Sec: 12.63, Grad Norm: 0.0071 +[2025-02-21 00:58:51] (step=0107300) Train Loss: 0.1488, Train Steps/Sec: 12.15, Grad Norm: 0.0073 +[2025-02-21 00:59:00] (step=0107400) Train Loss: 0.1498, Train Steps/Sec: 11.32, Grad Norm: 0.0072 +[2025-02-21 00:59:12] (step=0107500) Train Loss: 0.1491, Train Steps/Sec: 8.32, Grad Norm: 0.0071 +[2025-02-21 00:59:23] (step=0107600) Train Loss: 0.1497, Train Steps/Sec: 8.89, Grad Norm: 0.0073 +[2025-02-21 00:59:32] (step=0107700) Train Loss: 0.1495, Train Steps/Sec: 10.97, Grad Norm: 0.0073 +[2025-02-21 00:59:41] (step=0107800) Train Loss: 0.1484, Train Steps/Sec: 12.13, Grad Norm: 0.0072 +[2025-02-21 00:59:48] (step=0107900) Train Loss: 0.1478, Train Steps/Sec: 12.60, Grad Norm: 0.0070 +[2025-02-21 00:59:56] (step=0108000) Train Loss: 0.1488, Train Steps/Sec: 12.58, Grad Norm: 0.0071 +[2025-02-21 01:00:04] (step=0108100) Train Loss: 0.1486, Train Steps/Sec: 12.60, Grad Norm: 0.0073 +[2025-02-21 01:00:12] (step=0108200) Train Loss: 0.1504, Train Steps/Sec: 12.61, Grad Norm: 0.0074 +[2025-02-21 01:00:20] (step=0108300) Train Loss: 0.1474, Train Steps/Sec: 12.58, Grad Norm: 0.0073 +[2025-02-21 01:00:28] (step=0108400) Train Loss: 0.1497, Train Steps/Sec: 12.59, Grad Norm: 0.0073 +[2025-02-21 01:00:36] (step=0108500) Train Loss: 0.1480, Train Steps/Sec: 12.59, Grad Norm: 0.0073 +[2025-02-21 01:00:44] (step=0108600) Train Loss: 0.1483, Train Steps/Sec: 12.60, Grad Norm: 0.0072 +[2025-02-21 01:00:52] (step=0108700) Train Loss: 0.1498, Train Steps/Sec: 12.58, Grad Norm: 0.0073 +[2025-02-21 01:01:00] (step=0108800) Train Loss: 0.1492, Train Steps/Sec: 12.59, Grad Norm: 0.0074 +[2025-02-21 01:01:09] (step=0108900) Train Loss: 0.1495, Train Steps/Sec: 11.15, Grad Norm: 0.0071 +[2025-02-21 01:01:17] (step=0109000) Train Loss: 0.1487, Train Steps/Sec: 12.58, Grad Norm: 0.0071 +[2025-02-21 01:01:25] (step=0109100) Train Loss: 0.1491, Train Steps/Sec: 12.59, Grad Norm: 0.0078 +[2025-02-21 01:01:33] (step=0109200) Train Loss: 0.1483, Train Steps/Sec: 12.59, Grad Norm: 0.0072 +[2025-02-21 01:01:41] (step=0109300) Train Loss: 0.1481, Train Steps/Sec: 12.61, Grad Norm: 0.0072 +[2025-02-21 01:01:49] (step=0109400) Train Loss: 0.1486, Train Steps/Sec: 12.61, Grad Norm: 0.0072 +[2025-02-21 01:01:57] (step=0109500) Train Loss: 0.1493, Train Steps/Sec: 12.13, Grad Norm: 0.0072 +[2025-02-21 01:02:07] (step=0109600) Train Loss: 0.1484, Train Steps/Sec: 10.33, Grad Norm: 0.0073 +[2025-02-21 01:02:19] (step=0109700) Train Loss: 0.1495, Train Steps/Sec: 8.16, Grad Norm: 0.0072 +[2025-02-21 01:02:28] (step=0109800) Train Loss: 0.1500, Train Steps/Sec: 11.38, Grad Norm: 0.0077 +[2025-02-21 01:02:37] (step=0109900) Train Loss: 0.1491, Train Steps/Sec: 10.62, Grad Norm: 0.0074 +[2025-02-21 01:02:45] (step=0110000) Train Loss: 0.1487, Train Steps/Sec: 12.11, Grad Norm: 0.0073 +[2025-02-21 01:02:54] (step=0110100) Train Loss: 0.1483, Train Steps/Sec: 11.20, Grad Norm: 0.0073 +[2025-02-21 01:03:02] (step=0110200) Train Loss: 0.1486, Train Steps/Sec: 12.59, Grad Norm: 0.0072 +[2025-02-21 01:03:10] (step=0110300) Train Loss: 0.1485, Train Steps/Sec: 12.60, Grad Norm: 0.0073 +[2025-02-21 01:03:18] (step=0110400) Train Loss: 0.1484, Train Steps/Sec: 12.58, Grad Norm: 0.0073 +[2025-02-21 01:03:26] (step=0110500) Train Loss: 0.1499, Train Steps/Sec: 12.58, Grad Norm: 0.0075 +[2025-02-21 01:03:34] (step=0110600) Train Loss: 0.1479, Train Steps/Sec: 12.59, Grad Norm: 0.0076 +[2025-02-21 01:03:42] (step=0110700) Train Loss: 0.1490, Train Steps/Sec: 12.58, Grad Norm: 0.0072 +[2025-02-21 01:03:50] (step=0110800) Train Loss: 0.1488, Train Steps/Sec: 12.55, Grad Norm: 0.0072 +[2025-02-21 01:03:58] (step=0110900) Train Loss: 0.1485, Train Steps/Sec: 12.58, Grad Norm: 0.0069 +[2025-02-21 01:04:06] (step=0111000) Train Loss: 0.1498, Train Steps/Sec: 12.59, Grad Norm: 0.0073 +[2025-02-21 01:04:14] (step=0111100) Train Loss: 0.1492, Train Steps/Sec: 12.57, Grad Norm: 0.0070 +[2025-02-21 01:04:22] (step=0111200) Train Loss: 0.1480, Train Steps/Sec: 12.60, Grad Norm: 0.0073 +[2025-02-21 01:04:30] (step=0111300) Train Loss: 0.1488, Train Steps/Sec: 12.60, Grad Norm: 0.0070 +[2025-02-21 01:04:39] (step=0111400) Train Loss: 0.1499, Train Steps/Sec: 11.26, Grad Norm: 0.0073 +[2025-02-21 01:04:46] (step=0111500) Train Loss: 0.1474, Train Steps/Sec: 12.59, Grad Norm: 0.0072 +[2025-02-21 01:04:55] (step=0111600) Train Loss: 0.1489, Train Steps/Sec: 12.15, Grad Norm: 0.0073 +[2025-02-21 01:05:03] (step=0111700) Train Loss: 0.1498, Train Steps/Sec: 12.60, Grad Norm: 0.0073 +[2025-02-21 01:05:15] (step=0111800) Train Loss: 0.1482, Train Steps/Sec: 8.36, Grad Norm: 0.0070 +[2025-02-21 01:05:25] (step=0111900) Train Loss: 0.1499, Train Steps/Sec: 9.97, Grad Norm: 0.0070 +[2025-02-21 01:05:34] (step=0112000) Train Loss: 0.1480, Train Steps/Sec: 11.03, Grad Norm: 0.0069 +[2025-02-21 01:05:42] (step=0112100) Train Loss: 0.1490, Train Steps/Sec: 11.35, Grad Norm: 0.0074 +[2025-02-21 01:05:51] (step=0112200) Train Loss: 0.1479, Train Steps/Sec: 12.14, Grad Norm: 0.0072 +[2025-02-21 01:05:59] (step=0112300) Train Loss: 0.1484, Train Steps/Sec: 12.62, Grad Norm: 0.0072 +[2025-02-21 01:06:07] (step=0112400) Train Loss: 0.1504, Train Steps/Sec: 12.61, Grad Norm: 0.0070 +[2025-02-21 01:06:15] (step=0112500) Train Loss: 0.1499, Train Steps/Sec: 12.60, Grad Norm: 0.0072 +[2025-02-21 01:06:23] (step=0112600) Train Loss: 0.1488, Train Steps/Sec: 11.23, Grad Norm: 0.0074 +[2025-02-21 01:06:31] (step=0112700) Train Loss: 0.1477, Train Steps/Sec: 12.57, Grad Norm: 0.0071 +[2025-02-21 01:06:39] (step=0112800) Train Loss: 0.1483, Train Steps/Sec: 12.57, Grad Norm: 0.0074 +[2025-02-21 01:06:47] (step=0112900) Train Loss: 0.1487, Train Steps/Sec: 12.59, Grad Norm: 0.0073 +[2025-02-21 01:06:55] (step=0113000) Train Loss: 0.1492, Train Steps/Sec: 12.61, Grad Norm: 0.0071 +[2025-02-21 01:07:03] (step=0113100) Train Loss: 0.1488, Train Steps/Sec: 12.61, Grad Norm: 0.0071 +[2025-02-21 01:07:11] (step=0113200) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0069 +[2025-02-21 01:07:19] (step=0113300) Train Loss: 0.1488, Train Steps/Sec: 12.62, Grad Norm: 0.0069 +[2025-02-21 01:07:27] (step=0113400) Train Loss: 0.1485, Train Steps/Sec: 12.61, Grad Norm: 0.0071 +[2025-02-21 01:07:35] (step=0113500) Train Loss: 0.1504, Train Steps/Sec: 12.59, Grad Norm: 0.0071 +[2025-02-21 01:07:43] (step=0113600) Train Loss: 0.1479, Train Steps/Sec: 12.61, Grad Norm: 0.0071 +[2025-02-21 01:07:51] (step=0113700) Train Loss: 0.1501, Train Steps/Sec: 12.60, Grad Norm: 0.0071 +[2025-02-21 01:07:59] (step=0113800) Train Loss: 0.1478, Train Steps/Sec: 12.18, Grad Norm: 0.0071 +[2025-02-21 01:08:09] (step=0113900) Train Loss: 0.1489, Train Steps/Sec: 9.68, Grad Norm: 0.0073 +[2025-02-21 01:08:21] (step=0114000) Train Loss: 0.1496, Train Steps/Sec: 8.77, Grad Norm: 0.0071 +[2025-02-21 01:08:30] (step=0114100) Train Loss: 0.1473, Train Steps/Sec: 10.27, Grad Norm: 0.0071 +[2025-02-21 01:08:40] (step=0114200) Train Loss: 0.1495, Train Steps/Sec: 10.69, Grad Norm: 0.0071 +[2025-02-21 01:08:48] (step=0114300) Train Loss: 0.1488, Train Steps/Sec: 11.68, Grad Norm: 0.0074 +[2025-02-21 01:08:57] (step=0114400) Train Loss: 0.1480, Train Steps/Sec: 12.06, Grad Norm: 0.0071 +[2025-02-21 01:09:05] (step=0114500) Train Loss: 0.1493, Train Steps/Sec: 12.50, Grad Norm: 0.0070 +[2025-02-21 01:09:13] (step=0114600) Train Loss: 0.1488, Train Steps/Sec: 12.55, Grad Norm: 0.0073 +[2025-02-21 01:09:21] (step=0114700) Train Loss: 0.1488, Train Steps/Sec: 12.53, Grad Norm: 0.0070 +[2025-02-21 01:09:29] (step=0114800) Train Loss: 0.1481, Train Steps/Sec: 12.55, Grad Norm: 0.0075 +[2025-02-21 01:09:37] (step=0114900) Train Loss: 0.1495, Train Steps/Sec: 12.56, Grad Norm: 0.0069 +[2025-02-21 01:09:45] (step=0115000) Train Loss: 0.1489, Train Steps/Sec: 12.53, Grad Norm: 0.0071 +[2025-02-21 01:09:53] (step=0115100) Train Loss: 0.1471, Train Steps/Sec: 11.19, Grad Norm: 0.0072 +[2025-02-21 01:10:01] (step=0115200) Train Loss: 0.1482, Train Steps/Sec: 12.59, Grad Norm: 0.0070 +[2025-02-21 01:10:09] (step=0115300) Train Loss: 0.1477, Train Steps/Sec: 12.56, Grad Norm: 0.0072 +[2025-02-21 01:10:17] (step=0115400) Train Loss: 0.1483, Train Steps/Sec: 12.57, Grad Norm: 0.0070 +[2025-02-21 01:10:25] (step=0115500) Train Loss: 0.1488, Train Steps/Sec: 12.59, Grad Norm: 0.0070 +[2025-02-21 01:10:33] (step=0115600) Train Loss: 0.1499, Train Steps/Sec: 12.58, Grad Norm: 0.0071 +[2025-02-21 01:10:41] (step=0115700) Train Loss: 0.1492, Train Steps/Sec: 12.61, Grad Norm: 0.0069 +[2025-02-21 01:10:49] (step=0115800) Train Loss: 0.1499, Train Steps/Sec: 12.60, Grad Norm: 0.0070 +[2025-02-21 01:10:57] (step=0115900) Train Loss: 0.1494, Train Steps/Sec: 12.61, Grad Norm: 0.0074 +[2025-02-21 01:11:06] (step=0116000) Train Loss: 0.1491, Train Steps/Sec: 11.37, Grad Norm: 0.0070 +[2025-02-21 01:11:16] (step=0116100) Train Loss: 0.1487, Train Steps/Sec: 10.11, Grad Norm: 0.0073 +[2025-02-21 01:11:26] (step=0116200) Train Loss: 0.1486, Train Steps/Sec: 9.44, Grad Norm: 0.0071 +[2025-02-21 01:11:36] (step=0116300) Train Loss: 0.1492, Train Steps/Sec: 10.61, Grad Norm: 0.0070 +[2025-02-21 01:11:46] (step=0116400) Train Loss: 0.1497, Train Steps/Sec: 9.34, Grad Norm: 0.0071 +[2025-02-21 01:11:55] (step=0116500) Train Loss: 0.1500, Train Steps/Sec: 11.69, Grad Norm: 0.0071 +[2025-02-21 01:12:03] (step=0116600) Train Loss: 0.1494, Train Steps/Sec: 12.61, Grad Norm: 0.0070 +[2025-02-21 01:12:11] (step=0116700) Train Loss: 0.1478, Train Steps/Sec: 12.60, Grad Norm: 0.0072 +[2025-02-21 01:12:19] (step=0116800) Train Loss: 0.1488, Train Steps/Sec: 12.60, Grad Norm: 0.0070 +[2025-02-21 01:12:27] (step=0116900) Train Loss: 0.1479, Train Steps/Sec: 12.56, Grad Norm: 0.0069 +[2025-02-21 01:12:35] (step=0117000) Train Loss: 0.1491, Train Steps/Sec: 12.59, Grad Norm: 0.0068 +[2025-02-21 01:12:43] (step=0117100) Train Loss: 0.1483, Train Steps/Sec: 12.59, Grad Norm: 0.0071 +[2025-02-21 01:12:51] (step=0117200) Train Loss: 0.1485, Train Steps/Sec: 12.59, Grad Norm: 0.0070 +[2025-02-21 01:12:59] (step=0117300) Train Loss: 0.1488, Train Steps/Sec: 12.60, Grad Norm: 0.0072 +[2025-02-21 01:13:06] (step=0117400) Train Loss: 0.1478, Train Steps/Sec: 12.61, Grad Norm: 0.0072 +[2025-02-21 01:13:14] (step=0117500) Train Loss: 0.1493, Train Steps/Sec: 12.61, Grad Norm: 0.0072 +[2025-02-21 01:13:23] (step=0117600) Train Loss: 0.1487, Train Steps/Sec: 11.16, Grad Norm: 0.0070 +[2025-02-21 01:13:31] (step=0117700) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0069 +[2025-02-21 01:13:39] (step=0117800) Train Loss: 0.1478, Train Steps/Sec: 12.54, Grad Norm: 0.0072 +[2025-02-21 01:13:47] (step=0117900) Train Loss: 0.1481, Train Steps/Sec: 12.62, Grad Norm: 0.0069 +[2025-02-21 01:13:55] (step=0118000) Train Loss: 0.1505, Train Steps/Sec: 12.60, Grad Norm: 0.0069 +[2025-02-21 01:14:03] (step=0118100) Train Loss: 0.1494, Train Steps/Sec: 12.60, Grad Norm: 0.0072 +[2025-02-21 01:14:13] (step=0118200) Train Loss: 0.1501, Train Steps/Sec: 9.80, Grad Norm: 0.0070 +[2025-02-21 01:14:22] (step=0118300) Train Loss: 0.1496, Train Steps/Sec: 11.36, Grad Norm: 0.0071 +[2025-02-21 01:14:33] (step=0118400) Train Loss: 0.1483, Train Steps/Sec: 9.21, Grad Norm: 0.0071 +[2025-02-21 01:14:42] (step=0118500) Train Loss: 0.1486, Train Steps/Sec: 11.30, Grad Norm: 0.0074 +[2025-02-21 01:14:51] (step=0118600) Train Loss: 0.1495, Train Steps/Sec: 10.70, Grad Norm: 0.0069 +[2025-02-21 01:15:00] (step=0118700) Train Loss: 0.1501, Train Steps/Sec: 11.32, Grad Norm: 0.0071 +[2025-02-21 01:15:08] (step=0118800) Train Loss: 0.1495, Train Steps/Sec: 12.61, Grad Norm: 0.0074 +[2025-02-21 01:15:17] (step=0118900) Train Loss: 0.1477, Train Steps/Sec: 11.14, Grad Norm: 0.0071 +[2025-02-21 01:15:25] (step=0119000) Train Loss: 0.1481, Train Steps/Sec: 12.58, Grad Norm: 0.0071 +[2025-02-21 01:15:33] (step=0119100) Train Loss: 0.1495, Train Steps/Sec: 12.61, Grad Norm: 0.0071 +[2025-02-21 01:15:41] (step=0119200) Train Loss: 0.1482, Train Steps/Sec: 12.62, Grad Norm: 0.0069 +[2025-02-21 01:15:49] (step=0119300) Train Loss: 0.1497, Train Steps/Sec: 12.61, Grad Norm: 0.0071 +[2025-02-21 01:15:57] (step=0119400) Train Loss: 0.1484, Train Steps/Sec: 12.57, Grad Norm: 0.0072 +[2025-02-21 01:16:05] (step=0119500) Train Loss: 0.1490, Train Steps/Sec: 12.56, Grad Norm: 0.0069 +[2025-02-21 01:16:13] (step=0119600) Train Loss: 0.1486, Train Steps/Sec: 12.58, Grad Norm: 0.0070 +[2025-02-21 01:16:20] (step=0119700) Train Loss: 0.1481, Train Steps/Sec: 12.59, Grad Norm: 0.0069 +[2025-02-21 01:16:28] (step=0119800) Train Loss: 0.1492, Train Steps/Sec: 12.59, Grad Norm: 0.0069 +[2025-02-21 01:16:36] (step=0119900) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0070 +[2025-02-21 01:16:44] (step=0120000) Train Loss: 0.1473, Train Steps/Sec: 12.58, Grad Norm: 0.0070 +[2025-02-21 01:16:49] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0120000.pt +[2025-02-21 01:16:57] (step=0120100) Train Loss: 0.1476, Train Steps/Sec: 7.57, Grad Norm: 0.0070 +[2025-02-21 01:17:05] (step=0120200) Train Loss: 0.1484, Train Steps/Sec: 12.51, Grad Norm: 0.0069 +[2025-02-21 01:17:15] (step=0120300) Train Loss: 0.1485, Train Steps/Sec: 10.64, Grad Norm: 0.0072 +[2025-02-21 01:17:24] (step=0120400) Train Loss: 0.1474, Train Steps/Sec: 10.99, Grad Norm: 0.0070 +[2025-02-21 01:17:33] (step=0120500) Train Loss: 0.1488, Train Steps/Sec: 11.28, Grad Norm: 0.0071 +[2025-02-21 01:17:44] (step=0120600) Train Loss: 0.1485, Train Steps/Sec: 9.19, Grad Norm: 0.0070 +[2025-02-21 01:17:52] (step=0120700) Train Loss: 0.1492, Train Steps/Sec: 11.67, Grad Norm: 0.0072 +[2025-02-21 01:18:02] (step=0120800) Train Loss: 0.1474, Train Steps/Sec: 10.33, Grad Norm: 0.0069 +[2025-02-21 01:18:11] (step=0120900) Train Loss: 0.1494, Train Steps/Sec: 11.68, Grad Norm: 0.0071 +[2025-02-21 01:18:19] (step=0121000) Train Loss: 0.1486, Train Steps/Sec: 12.57, Grad Norm: 0.0072 +[2025-02-21 01:18:26] (step=0121100) Train Loss: 0.1491, Train Steps/Sec: 12.56, Grad Norm: 0.0068 +[2025-02-21 01:18:34] (step=0121200) Train Loss: 0.1491, Train Steps/Sec: 12.57, Grad Norm: 0.0070 +[2025-02-21 01:18:42] (step=0121300) Train Loss: 0.1483, Train Steps/Sec: 12.57, Grad Norm: 0.0069 +[2025-02-21 01:18:51] (step=0121400) Train Loss: 0.1500, Train Steps/Sec: 11.10, Grad Norm: 0.0071 +[2025-02-21 01:18:59] (step=0121500) Train Loss: 0.1483, Train Steps/Sec: 12.57, Grad Norm: 0.0072 +[2025-02-21 01:19:07] (step=0121600) Train Loss: 0.1487, Train Steps/Sec: 12.58, Grad Norm: 0.0068 +[2025-02-21 01:19:15] (step=0121700) Train Loss: 0.1486, Train Steps/Sec: 12.56, Grad Norm: 0.0069 +[2025-02-21 01:19:23] (step=0121800) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0069 +[2025-02-21 01:19:31] (step=0121900) Train Loss: 0.1483, Train Steps/Sec: 12.61, Grad Norm: 0.0068 +[2025-02-21 01:19:39] (step=0122000) Train Loss: 0.1479, Train Steps/Sec: 12.60, Grad Norm: 0.0070 +[2025-02-21 01:19:47] (step=0122100) Train Loss: 0.1500, Train Steps/Sec: 12.62, Grad Norm: 0.0071 +[2025-02-21 01:19:55] (step=0122200) Train Loss: 0.1493, Train Steps/Sec: 12.61, Grad Norm: 0.0069 +[2025-02-21 01:20:03] (step=0122300) Train Loss: 0.1478, Train Steps/Sec: 12.60, Grad Norm: 0.0070 +[2025-02-21 01:20:11] (step=0122400) Train Loss: 0.1488, Train Steps/Sec: 12.14, Grad Norm: 0.0074 +[2025-02-21 01:20:21] (step=0122500) Train Loss: 0.1491, Train Steps/Sec: 9.79, Grad Norm: 0.0069 +[2025-02-21 01:20:30] (step=0122600) Train Loss: 0.1483, Train Steps/Sec: 11.15, Grad Norm: 0.0069 +[2025-02-21 01:20:40] (step=0122700) Train Loss: 0.1483, Train Steps/Sec: 10.27, Grad Norm: 0.0070 +[2025-02-21 01:20:50] (step=0122800) Train Loss: 0.1487, Train Steps/Sec: 9.66, Grad Norm: 0.0072 +[2025-02-21 01:20:59] (step=0122900) Train Loss: 0.1483, Train Steps/Sec: 12.09, Grad Norm: 0.0071 +[2025-02-21 01:21:08] (step=0123000) Train Loss: 0.1494, Train Steps/Sec: 10.33, Grad Norm: 0.0068 +[2025-02-21 01:21:17] (step=0123100) Train Loss: 0.1480, Train Steps/Sec: 11.67, Grad Norm: 0.0071 +[2025-02-21 01:21:25] (step=0123200) Train Loss: 0.1494, Train Steps/Sec: 12.56, Grad Norm: 0.0071 +[2025-02-21 01:21:33] (step=0123300) Train Loss: 0.1473, Train Steps/Sec: 12.54, Grad Norm: 0.0073 +[2025-02-21 01:21:41] (step=0123400) Train Loss: 0.1489, Train Steps/Sec: 12.53, Grad Norm: 0.0071 +[2025-02-21 01:21:49] (step=0123500) Train Loss: 0.1502, Train Steps/Sec: 12.54, Grad Norm: 0.0071 +[2025-02-21 01:21:57] (step=0123600) Train Loss: 0.1490, Train Steps/Sec: 12.54, Grad Norm: 0.0071 +[2025-02-21 01:22:05] (step=0123700) Train Loss: 0.1479, Train Steps/Sec: 12.55, Grad Norm: 0.0069 +[2025-02-21 01:22:13] (step=0123800) Train Loss: 0.1499, Train Steps/Sec: 12.56, Grad Norm: 0.0071 +[2025-02-21 01:22:22] (step=0123900) Train Loss: 0.1469, Train Steps/Sec: 11.18, Grad Norm: 0.0068 +[2025-02-21 01:22:30] (step=0124000) Train Loss: 0.1474, Train Steps/Sec: 12.56, Grad Norm: 0.0071 +[2025-02-21 01:22:38] (step=0124100) Train Loss: 0.1489, Train Steps/Sec: 12.57, Grad Norm: 0.0072 +[2025-02-21 01:22:46] (step=0124200) Train Loss: 0.1482, Train Steps/Sec: 12.55, Grad Norm: 0.0071 +[2025-02-21 01:22:54] (step=0124300) Train Loss: 0.1482, Train Steps/Sec: 12.56, Grad Norm: 0.0069 +[2025-02-21 01:23:01] (step=0124400) Train Loss: 0.1478, Train Steps/Sec: 12.57, Grad Norm: 0.0068 +[2025-02-21 01:23:09] (step=0124500) Train Loss: 0.1489, Train Steps/Sec: 12.56, Grad Norm: 0.0069 +[2025-02-21 01:23:19] (step=0124600) Train Loss: 0.1490, Train Steps/Sec: 10.38, Grad Norm: 0.0069 +[2025-02-21 01:23:28] (step=0124700) Train Loss: 0.1474, Train Steps/Sec: 11.40, Grad Norm: 0.0068 +[2025-02-21 01:23:36] (step=0124800) Train Loss: 0.1488, Train Steps/Sec: 12.57, Grad Norm: 0.0069 +[2025-02-21 01:23:46] (step=0124900) Train Loss: 0.1493, Train Steps/Sec: 10.25, Grad Norm: 0.0072 +[2025-02-21 01:23:56] (step=0125000) Train Loss: 0.1494, Train Steps/Sec: 9.69, Grad Norm: 0.0069 +[2025-02-21 01:24:05] (step=0125100) Train Loss: 0.1494, Train Steps/Sec: 11.53, Grad Norm: 0.0070 +[2025-02-21 01:24:15] (step=0125200) Train Loss: 0.1488, Train Steps/Sec: 9.73, Grad Norm: 0.0069 +[2025-02-21 01:24:23] (step=0125300) Train Loss: 0.1478, Train Steps/Sec: 11.66, Grad Norm: 0.0070 +[2025-02-21 01:24:31] (step=0125400) Train Loss: 0.1482, Train Steps/Sec: 12.59, Grad Norm: 0.0069 +[2025-02-21 01:24:39] (step=0125500) Train Loss: 0.1495, Train Steps/Sec: 12.57, Grad Norm: 0.0067 +[2025-02-21 01:24:47] (step=0125600) Train Loss: 0.1483, Train Steps/Sec: 12.59, Grad Norm: 0.0069 +[2025-02-21 01:24:55] (step=0125700) Train Loss: 0.1468, Train Steps/Sec: 12.59, Grad Norm: 0.0069 +[2025-02-21 01:25:03] (step=0125800) Train Loss: 0.1483, Train Steps/Sec: 12.60, Grad Norm: 0.0071 +[2025-02-21 01:25:11] (step=0125900) Train Loss: 0.1485, Train Steps/Sec: 12.59, Grad Norm: 0.0069 +[2025-02-21 01:25:19] (step=0126000) Train Loss: 0.1481, Train Steps/Sec: 12.59, Grad Norm: 0.0068 +[2025-02-21 01:25:27] (step=0126100) Train Loss: 0.1482, Train Steps/Sec: 12.57, Grad Norm: 0.0070 +[2025-02-21 01:25:35] (step=0126200) Train Loss: 0.1481, Train Steps/Sec: 12.58, Grad Norm: 0.0069 +[2025-02-21 01:25:43] (step=0126300) Train Loss: 0.1487, Train Steps/Sec: 12.61, Grad Norm: 0.0068 +[2025-02-21 01:25:52] (step=0126400) Train Loss: 0.1482, Train Steps/Sec: 11.22, Grad Norm: 0.0069 +[2025-02-21 01:26:00] (step=0126500) Train Loss: 0.1499, Train Steps/Sec: 12.53, Grad Norm: 0.0069 +[2025-02-21 01:26:08] (step=0126600) Train Loss: 0.1489, Train Steps/Sec: 12.63, Grad Norm: 0.0069 +[2025-02-21 01:26:16] (step=0126700) Train Loss: 0.1488, Train Steps/Sec: 11.40, Grad Norm: 0.0068 +[2025-02-21 01:26:26] (step=0126800) Train Loss: 0.1483, Train Steps/Sec: 10.69, Grad Norm: 0.0068 +[2025-02-21 01:26:34] (step=0126900) Train Loss: 0.1496, Train Steps/Sec: 12.19, Grad Norm: 0.0070 +[2025-02-21 01:26:42] (step=0127000) Train Loss: 0.1492, Train Steps/Sec: 12.15, Grad Norm: 0.0069 +[2025-02-21 01:26:53] (step=0127100) Train Loss: 0.1496, Train Steps/Sec: 9.74, Grad Norm: 0.0070 +[2025-02-21 01:27:02] (step=0127200) Train Loss: 0.1488, Train Steps/Sec: 10.30, Grad Norm: 0.0069 +[2025-02-21 01:27:10] (step=0127300) Train Loss: 0.1488, Train Steps/Sec: 12.18, Grad Norm: 0.0068 +[2025-02-21 01:27:20] (step=0127400) Train Loss: 0.1472, Train Steps/Sec: 10.69, Grad Norm: 0.0071 +[2025-02-21 01:27:28] (step=0127500) Train Loss: 0.1489, Train Steps/Sec: 11.70, Grad Norm: 0.0070 +[2025-02-21 01:27:36] (step=0127600) Train Loss: 0.1484, Train Steps/Sec: 12.55, Grad Norm: 0.0067 +[2025-02-21 01:27:45] (step=0127700) Train Loss: 0.1487, Train Steps/Sec: 11.24, Grad Norm: 0.0071 +[2025-02-21 01:27:53] (step=0127800) Train Loss: 0.1481, Train Steps/Sec: 12.59, Grad Norm: 0.0071 +[2025-02-21 01:28:01] (step=0127900) Train Loss: 0.1479, Train Steps/Sec: 12.60, Grad Norm: 0.0068 +[2025-02-21 01:28:09] (step=0128000) Train Loss: 0.1480, Train Steps/Sec: 12.60, Grad Norm: 0.0069 +[2025-02-21 01:28:17] (step=0128100) Train Loss: 0.1475, Train Steps/Sec: 12.58, Grad Norm: 0.0070 +[2025-02-21 01:28:25] (step=0128200) Train Loss: 0.1490, Train Steps/Sec: 12.59, Grad Norm: 0.0071 +[2025-02-21 01:28:33] (step=0128300) Train Loss: 0.1484, Train Steps/Sec: 12.62, Grad Norm: 0.0068 +[2025-02-21 01:28:41] (step=0128400) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0068 +[2025-02-21 01:28:49] (step=0128500) Train Loss: 0.1489, Train Steps/Sec: 12.59, Grad Norm: 0.0069 +[2025-02-21 01:28:57] (step=0128600) Train Loss: 0.1493, Train Steps/Sec: 12.59, Grad Norm: 0.0069 +[2025-02-21 01:29:05] (step=0128700) Train Loss: 0.1483, Train Steps/Sec: 12.58, Grad Norm: 0.0069 +[2025-02-21 01:29:13] (step=0128800) Train Loss: 0.1499, Train Steps/Sec: 11.73, Grad Norm: 0.0068 +[2025-02-21 01:29:24] (step=0128900) Train Loss: 0.1503, Train Steps/Sec: 9.43, Grad Norm: 0.0067 +[2025-02-21 01:29:32] (step=0129000) Train Loss: 0.1481, Train Steps/Sec: 12.60, Grad Norm: 0.0070 +[2025-02-21 01:29:40] (step=0129100) Train Loss: 0.1476, Train Steps/Sec: 12.16, Grad Norm: 0.0069 +[2025-02-21 01:29:49] (step=0129200) Train Loss: 0.1465, Train Steps/Sec: 11.32, Grad Norm: 0.0071 +[2025-02-21 01:29:59] (step=0129300) Train Loss: 0.1483, Train Steps/Sec: 9.99, Grad Norm: 0.0070 +[2025-02-21 01:30:08] (step=0129400) Train Loss: 0.1489, Train Steps/Sec: 10.97, Grad Norm: 0.0070 +[2025-02-21 01:30:17] (step=0129500) Train Loss: 0.1495, Train Steps/Sec: 11.43, Grad Norm: 0.0069 +[2025-02-21 01:30:25] (step=0129600) Train Loss: 0.1485, Train Steps/Sec: 11.36, Grad Norm: 0.0069 +[2025-02-21 01:30:34] (step=0129700) Train Loss: 0.1470, Train Steps/Sec: 11.72, Grad Norm: 0.0068 +[2025-02-21 01:30:42] (step=0129800) Train Loss: 0.1480, Train Steps/Sec: 12.61, Grad Norm: 0.0070 +[2025-02-21 01:30:50] (step=0129900) Train Loss: 0.1484, Train Steps/Sec: 12.61, Grad Norm: 0.0070 +[2025-02-21 01:30:58] (step=0130000) Train Loss: 0.1485, Train Steps/Sec: 12.61, Grad Norm: 0.0069 +[2025-02-21 01:31:06] (step=0130100) Train Loss: 0.1480, Train Steps/Sec: 12.56, Grad Norm: 0.0068 +[2025-02-21 01:31:15] (step=0130200) Train Loss: 0.1492, Train Steps/Sec: 11.22, Grad Norm: 0.0070 +[2025-02-21 01:31:23] (step=0130300) Train Loss: 0.1496, Train Steps/Sec: 12.56, Grad Norm: 0.0068 +[2025-02-21 01:31:31] (step=0130400) Train Loss: 0.1471, Train Steps/Sec: 12.56, Grad Norm: 0.0069 +[2025-02-21 01:31:39] (step=0130500) Train Loss: 0.1474, Train Steps/Sec: 12.59, Grad Norm: 0.0070 +[2025-02-21 01:31:46] (step=0130600) Train Loss: 0.1488, Train Steps/Sec: 12.59, Grad Norm: 0.0069 +[2025-02-21 01:31:54] (step=0130700) Train Loss: 0.1471, Train Steps/Sec: 12.60, Grad Norm: 0.0068 +[2025-02-21 01:32:02] (step=0130800) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0068 +[2025-02-21 01:32:10] (step=0130900) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0072 +[2025-02-21 01:32:19] (step=0131000) Train Loss: 0.1474, Train Steps/Sec: 10.98, Grad Norm: 0.0070 +[2025-02-21 01:32:28] (step=0131100) Train Loss: 0.1495, Train Steps/Sec: 11.00, Grad Norm: 0.0067 +[2025-02-21 01:32:36] (step=0131200) Train Loss: 0.1481, Train Steps/Sec: 12.58, Grad Norm: 0.0071 +[2025-02-21 01:32:45] (step=0131300) Train Loss: 0.1481, Train Steps/Sec: 12.15, Grad Norm: 0.0069 +[2025-02-21 01:32:55] (step=0131400) Train Loss: 0.1495, Train Steps/Sec: 10.15, Grad Norm: 0.0067 +[2025-02-21 01:33:05] (step=0131500) Train Loss: 0.1479, Train Steps/Sec: 9.38, Grad Norm: 0.0068 +[2025-02-21 01:33:14] (step=0131600) Train Loss: 0.1484, Train Steps/Sec: 11.30, Grad Norm: 0.0069 +[2025-02-21 01:33:23] (step=0131700) Train Loss: 0.1475, Train Steps/Sec: 10.96, Grad Norm: 0.0069 +[2025-02-21 01:33:32] (step=0131800) Train Loss: 0.1481, Train Steps/Sec: 11.31, Grad Norm: 0.0067 +[2025-02-21 01:33:40] (step=0131900) Train Loss: 0.1486, Train Steps/Sec: 12.09, Grad Norm: 0.0068 +[2025-02-21 01:33:48] (step=0132000) Train Loss: 0.1489, Train Steps/Sec: 12.57, Grad Norm: 0.0068 +[2025-02-21 01:33:56] (step=0132100) Train Loss: 0.1487, Train Steps/Sec: 12.54, Grad Norm: 0.0068 +[2025-02-21 01:34:04] (step=0132200) Train Loss: 0.1477, Train Steps/Sec: 12.56, Grad Norm: 0.0067 +[2025-02-21 01:34:12] (step=0132300) Train Loss: 0.1473, Train Steps/Sec: 12.56, Grad Norm: 0.0066 +[2025-02-21 01:34:20] (step=0132400) Train Loss: 0.1492, Train Steps/Sec: 12.57, Grad Norm: 0.0068 +[2025-02-21 01:34:28] (step=0132500) Train Loss: 0.1486, Train Steps/Sec: 12.55, Grad Norm: 0.0068 +[2025-02-21 01:34:36] (step=0132600) Train Loss: 0.1499, Train Steps/Sec: 12.53, Grad Norm: 0.0070 +[2025-02-21 01:34:45] (step=0132700) Train Loss: 0.1481, Train Steps/Sec: 11.18, Grad Norm: 0.0071 +[2025-02-21 01:34:53] (step=0132800) Train Loss: 0.1495, Train Steps/Sec: 12.58, Grad Norm: 0.0072 +[2025-02-21 01:35:01] (step=0132900) Train Loss: 0.1480, Train Steps/Sec: 12.56, Grad Norm: 0.0067 +[2025-02-21 01:35:09] (step=0133000) Train Loss: 0.1480, Train Steps/Sec: 12.59, Grad Norm: 0.0068 +[2025-02-21 01:35:18] (step=0133100) Train Loss: 0.1499, Train Steps/Sec: 11.32, Grad Norm: 0.0073 +[2025-02-21 01:35:27] (step=0133200) Train Loss: 0.1489, Train Steps/Sec: 10.73, Grad Norm: 0.0072 +[2025-02-21 01:35:35] (step=0133300) Train Loss: 0.1489, Train Steps/Sec: 12.57, Grad Norm: 0.0069 +[2025-02-21 01:35:43] (step=0133400) Train Loss: 0.1483, Train Steps/Sec: 12.60, Grad Norm: 0.0069 +[2025-02-21 01:35:51] (step=0133500) Train Loss: 0.1485, Train Steps/Sec: 12.16, Grad Norm: 0.0067 +[2025-02-21 01:36:00] (step=0133600) Train Loss: 0.1488, Train Steps/Sec: 11.30, Grad Norm: 0.0068 +[2025-02-21 01:36:11] (step=0133700) Train Loss: 0.1483, Train Steps/Sec: 9.41, Grad Norm: 0.0068 +[2025-02-21 01:36:19] (step=0133800) Train Loss: 0.1490, Train Steps/Sec: 11.31, Grad Norm: 0.0069 +[2025-02-21 01:36:29] (step=0133900) Train Loss: 0.1491, Train Steps/Sec: 10.20, Grad Norm: 0.0067 +[2025-02-21 01:36:38] (step=0134000) Train Loss: 0.1485, Train Steps/Sec: 10.95, Grad Norm: 0.0070 +[2025-02-21 01:36:46] (step=0134100) Train Loss: 0.1494, Train Steps/Sec: 12.61, Grad Norm: 0.0069 +[2025-02-21 01:36:54] (step=0134200) Train Loss: 0.1504, Train Steps/Sec: 12.61, Grad Norm: 0.0072 +[2025-02-21 01:37:02] (step=0134300) Train Loss: 0.1489, Train Steps/Sec: 12.61, Grad Norm: 0.0069 +[2025-02-21 01:37:10] (step=0134400) Train Loss: 0.1480, Train Steps/Sec: 12.59, Grad Norm: 0.0069 +[2025-02-21 01:37:18] (step=0134500) Train Loss: 0.1477, Train Steps/Sec: 12.62, Grad Norm: 0.0067 +[2025-02-21 01:37:26] (step=0134600) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0067 +[2025-02-21 01:37:34] (step=0134700) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 01:37:42] (step=0134800) Train Loss: 0.1496, Train Steps/Sec: 12.62, Grad Norm: 0.0068 +[2025-02-21 01:37:50] (step=0134900) Train Loss: 0.1483, Train Steps/Sec: 12.61, Grad Norm: 0.0067 +[2025-02-21 01:37:58] (step=0135000) Train Loss: 0.1500, Train Steps/Sec: 12.62, Grad Norm: 0.0069 +[2025-02-21 01:38:06] (step=0135100) Train Loss: 0.1481, Train Steps/Sec: 12.55, Grad Norm: 0.0069 +[2025-02-21 01:38:15] (step=0135200) Train Loss: 0.1481, Train Steps/Sec: 10.57, Grad Norm: 0.0068 +[2025-02-21 01:38:24] (step=0135300) Train Loss: 0.1482, Train Steps/Sec: 10.68, Grad Norm: 0.0067 +[2025-02-21 01:38:33] (step=0135400) Train Loss: 0.1492, Train Steps/Sec: 12.21, Grad Norm: 0.0070 +[2025-02-21 01:38:41] (step=0135500) Train Loss: 0.1498, Train Steps/Sec: 12.61, Grad Norm: 0.0069 +[2025-02-21 01:38:49] (step=0135600) Train Loss: 0.1485, Train Steps/Sec: 12.15, Grad Norm: 0.0068 +[2025-02-21 01:38:57] (step=0135700) Train Loss: 0.1483, Train Steps/Sec: 12.61, Grad Norm: 0.0069 +[2025-02-21 01:39:06] (step=0135800) Train Loss: 0.1486, Train Steps/Sec: 11.32, Grad Norm: 0.0069 +[2025-02-21 01:39:16] (step=0135900) Train Loss: 0.1476, Train Steps/Sec: 9.19, Grad Norm: 0.0069 +[2025-02-21 01:39:25] (step=0136000) Train Loss: 0.1485, Train Steps/Sec: 11.73, Grad Norm: 0.0067 +[2025-02-21 01:39:34] (step=0136100) Train Loss: 0.1492, Train Steps/Sec: 11.00, Grad Norm: 0.0069 +[2025-02-21 01:39:43] (step=0136200) Train Loss: 0.1494, Train Steps/Sec: 10.93, Grad Norm: 0.0068 +[2025-02-21 01:39:51] (step=0136300) Train Loss: 0.1495, Train Steps/Sec: 12.61, Grad Norm: 0.0070 +[2025-02-21 01:40:00] (step=0136400) Train Loss: 0.1495, Train Steps/Sec: 11.20, Grad Norm: 0.0069 +[2025-02-21 01:40:08] (step=0136500) Train Loss: 0.1479, Train Steps/Sec: 12.59, Grad Norm: 0.0069 +[2025-02-21 01:40:16] (step=0136600) Train Loss: 0.1496, Train Steps/Sec: 12.60, Grad Norm: 0.0070 +[2025-02-21 01:40:24] (step=0136700) Train Loss: 0.1489, Train Steps/Sec: 12.58, Grad Norm: 0.0066 +[2025-02-21 01:40:32] (step=0136800) Train Loss: 0.1486, Train Steps/Sec: 12.61, Grad Norm: 0.0067 +[2025-02-21 01:40:40] (step=0136900) Train Loss: 0.1486, Train Steps/Sec: 12.61, Grad Norm: 0.0070 +[2025-02-21 01:40:48] (step=0137000) Train Loss: 0.1479, Train Steps/Sec: 12.62, Grad Norm: 0.0069 +[2025-02-21 01:40:56] (step=0137100) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0067 +[2025-02-21 01:41:04] (step=0137200) Train Loss: 0.1492, Train Steps/Sec: 12.61, Grad Norm: 0.0068 +[2025-02-21 01:41:12] (step=0137300) Train Loss: 0.1492, Train Steps/Sec: 12.61, Grad Norm: 0.0068 +[2025-02-21 01:41:21] (step=0137400) Train Loss: 0.1482, Train Steps/Sec: 11.04, Grad Norm: 0.0067 +[2025-02-21 01:41:30] (step=0137500) Train Loss: 0.1478, Train Steps/Sec: 11.05, Grad Norm: 0.0066 +[2025-02-21 01:41:38] (step=0137600) Train Loss: 0.1480, Train Steps/Sec: 12.57, Grad Norm: 0.0067 +[2025-02-21 01:41:46] (step=0137700) Train Loss: 0.1473, Train Steps/Sec: 11.26, Grad Norm: 0.0068 +[2025-02-21 01:41:55] (step=0137800) Train Loss: 0.1492, Train Steps/Sec: 12.19, Grad Norm: 0.0068 +[2025-02-21 01:42:03] (step=0137900) Train Loss: 0.1466, Train Steps/Sec: 12.63, Grad Norm: 0.0066 +[2025-02-21 01:42:13] (step=0138000) Train Loss: 0.1477, Train Steps/Sec: 9.95, Grad Norm: 0.0067 +[2025-02-21 01:42:23] (step=0138100) Train Loss: 0.1494, Train Steps/Sec: 10.00, Grad Norm: 0.0069 +[2025-02-21 01:42:31] (step=0138200) Train Loss: 0.1480, Train Steps/Sec: 12.16, Grad Norm: 0.0066 +[2025-02-21 01:42:40] (step=0138300) Train Loss: 0.1491, Train Steps/Sec: 11.01, Grad Norm: 0.0068 +[2025-02-21 01:42:49] (step=0138400) Train Loss: 0.1488, Train Steps/Sec: 10.98, Grad Norm: 0.0067 +[2025-02-21 01:42:57] (step=0138500) Train Loss: 0.1489, Train Steps/Sec: 12.62, Grad Norm: 0.0069 +[2025-02-21 01:43:05] (step=0138600) Train Loss: 0.1488, Train Steps/Sec: 12.62, Grad Norm: 0.0070 +[2025-02-21 01:43:13] (step=0138700) Train Loss: 0.1485, Train Steps/Sec: 12.56, Grad Norm: 0.0069 +[2025-02-21 01:43:21] (step=0138800) Train Loss: 0.1469, Train Steps/Sec: 12.57, Grad Norm: 0.0066 +[2025-02-21 01:43:30] (step=0138900) Train Loss: 0.1476, Train Steps/Sec: 11.16, Grad Norm: 0.0065 +[2025-02-21 01:43:38] (step=0139000) Train Loss: 0.1483, Train Steps/Sec: 12.59, Grad Norm: 0.0071 +[2025-02-21 01:43:46] (step=0139100) Train Loss: 0.1492, Train Steps/Sec: 12.41, Grad Norm: 0.0067 +[2025-02-21 01:43:54] (step=0139200) Train Loss: 0.1476, Train Steps/Sec: 12.52, Grad Norm: 0.0070 +[2025-02-21 01:44:02] (step=0139300) Train Loss: 0.1473, Train Steps/Sec: 12.56, Grad Norm: 0.0069 +[2025-02-21 01:44:10] (step=0139400) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0071 +[2025-02-21 01:44:18] (step=0139500) Train Loss: 0.1494, Train Steps/Sec: 11.38, Grad Norm: 0.0068 +[2025-02-21 01:44:28] (step=0139600) Train Loss: 0.1480, Train Steps/Sec: 10.74, Grad Norm: 0.0066 +[2025-02-21 01:44:36] (step=0139700) Train Loss: 0.1484, Train Steps/Sec: 12.61, Grad Norm: 0.0070 +[2025-02-21 01:44:44] (step=0139800) Train Loss: 0.1490, Train Steps/Sec: 12.56, Grad Norm: 0.0067 +[2025-02-21 01:44:52] (step=0139900) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0068 +[2025-02-21 01:45:00] (step=0140000) Train Loss: 0.1482, Train Steps/Sec: 12.17, Grad Norm: 0.0069 +[2025-02-21 01:45:04] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0140000.pt +[2025-02-21 01:45:12] (step=0140100) Train Loss: 0.1502, Train Steps/Sec: 8.37, Grad Norm: 0.0068 +[2025-02-21 01:45:23] (step=0140200) Train Loss: 0.1483, Train Steps/Sec: 8.61, Grad Norm: 0.0067 +[2025-02-21 01:45:33] (step=0140300) Train Loss: 0.1492, Train Steps/Sec: 10.17, Grad Norm: 0.0067 +[2025-02-21 01:45:41] (step=0140400) Train Loss: 0.1487, Train Steps/Sec: 12.60, Grad Norm: 0.0068 +[2025-02-21 01:45:51] (step=0140500) Train Loss: 0.1481, Train Steps/Sec: 10.66, Grad Norm: 0.0067 +[2025-02-21 01:45:59] (step=0140600) Train Loss: 0.1494, Train Steps/Sec: 11.22, Grad Norm: 0.0069 +[2025-02-21 01:46:07] (step=0140700) Train Loss: 0.1488, Train Steps/Sec: 12.56, Grad Norm: 0.0067 +[2025-02-21 01:46:15] (step=0140800) Train Loss: 0.1489, Train Steps/Sec: 12.56, Grad Norm: 0.0067 +[2025-02-21 01:46:23] (step=0140900) Train Loss: 0.1479, Train Steps/Sec: 12.55, Grad Norm: 0.0068 +[2025-02-21 01:46:31] (step=0141000) Train Loss: 0.1482, Train Steps/Sec: 12.57, Grad Norm: 0.0067 +[2025-02-21 01:46:39] (step=0141100) Train Loss: 0.1488, Train Steps/Sec: 12.57, Grad Norm: 0.0069 +[2025-02-21 01:46:47] (step=0141200) Train Loss: 0.1488, Train Steps/Sec: 12.54, Grad Norm: 0.0068 +[2025-02-21 01:46:55] (step=0141300) Train Loss: 0.1488, Train Steps/Sec: 12.55, Grad Norm: 0.0067 +[2025-02-21 01:47:04] (step=0141400) Train Loss: 0.1483, Train Steps/Sec: 11.17, Grad Norm: 0.0067 +[2025-02-21 01:47:12] (step=0141500) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0068 +[2025-02-21 01:47:20] (step=0141600) Train Loss: 0.1500, Train Steps/Sec: 12.12, Grad Norm: 0.0066 +[2025-02-21 01:47:30] (step=0141700) Train Loss: 0.1474, Train Steps/Sec: 10.36, Grad Norm: 0.0067 +[2025-02-21 01:47:38] (step=0141800) Train Loss: 0.1477, Train Steps/Sec: 12.21, Grad Norm: 0.0065 +[2025-02-21 01:47:46] (step=0141900) Train Loss: 0.1492, Train Steps/Sec: 12.62, Grad Norm: 0.0069 +[2025-02-21 01:47:54] (step=0142000) Train Loss: 0.1486, Train Steps/Sec: 12.60, Grad Norm: 0.0067 +[2025-02-21 01:48:02] (step=0142100) Train Loss: 0.1485, Train Steps/Sec: 12.11, Grad Norm: 0.0066 +[2025-02-21 01:48:10] (step=0142200) Train Loss: 0.1492, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 01:48:18] (step=0142300) Train Loss: 0.1488, Train Steps/Sec: 12.61, Grad Norm: 0.0068 +[2025-02-21 01:48:29] (step=0142400) Train Loss: 0.1469, Train Steps/Sec: 9.41, Grad Norm: 0.0067 +[2025-02-21 01:48:39] (step=0142500) Train Loss: 0.1472, Train Steps/Sec: 10.28, Grad Norm: 0.0069 +[2025-02-21 01:48:47] (step=0142600) Train Loss: 0.1487, Train Steps/Sec: 12.55, Grad Norm: 0.0068 +[2025-02-21 01:48:57] (step=0142700) Train Loss: 0.1480, Train Steps/Sec: 9.56, Grad Norm: 0.0070 +[2025-02-21 01:49:06] (step=0142800) Train Loss: 0.1486, Train Steps/Sec: 11.33, Grad Norm: 0.0067 +[2025-02-21 01:49:14] (step=0142900) Train Loss: 0.1483, Train Steps/Sec: 12.62, Grad Norm: 0.0067 +[2025-02-21 01:49:22] (step=0143000) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0068 +[2025-02-21 01:49:30] (step=0143100) Train Loss: 0.1488, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 01:49:38] (step=0143200) Train Loss: 0.1479, Train Steps/Sec: 12.62, Grad Norm: 0.0067 +[2025-02-21 01:49:45] (step=0143300) Train Loss: 0.1482, Train Steps/Sec: 12.62, Grad Norm: 0.0069 +[2025-02-21 01:49:53] (step=0143400) Train Loss: 0.1489, Train Steps/Sec: 12.62, Grad Norm: 0.0066 +[2025-02-21 01:50:01] (step=0143500) Train Loss: 0.1472, Train Steps/Sec: 12.62, Grad Norm: 0.0069 +[2025-02-21 01:50:09] (step=0143600) Train Loss: 0.1492, Train Steps/Sec: 12.62, Grad Norm: 0.0066 +[2025-02-21 01:50:17] (step=0143700) Train Loss: 0.1481, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 01:50:26] (step=0143800) Train Loss: 0.1481, Train Steps/Sec: 11.01, Grad Norm: 0.0068 +[2025-02-21 01:50:36] (step=0143900) Train Loss: 0.1491, Train Steps/Sec: 9.98, Grad Norm: 0.0068 +[2025-02-21 01:50:44] (step=0144000) Train Loss: 0.1491, Train Steps/Sec: 12.60, Grad Norm: 0.0066 +[2025-02-21 01:50:52] (step=0144100) Train Loss: 0.1477, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 01:51:00] (step=0144200) Train Loss: 0.1478, Train Steps/Sec: 12.62, Grad Norm: 0.0068 +[2025-02-21 01:51:08] (step=0144300) Train Loss: 0.1483, Train Steps/Sec: 12.13, Grad Norm: 0.0066 +[2025-02-21 01:51:16] (step=0144400) Train Loss: 0.1481, Train Steps/Sec: 12.61, Grad Norm: 0.0068 +[2025-02-21 01:51:24] (step=0144500) Train Loss: 0.1471, Train Steps/Sec: 12.16, Grad Norm: 0.0066 +[2025-02-21 01:51:35] (step=0144600) Train Loss: 0.1474, Train Steps/Sec: 9.74, Grad Norm: 0.0068 +[2025-02-21 01:51:44] (step=0144700) Train Loss: 0.1477, Train Steps/Sec: 10.31, Grad Norm: 0.0067 +[2025-02-21 01:51:52] (step=0144800) Train Loss: 0.1498, Train Steps/Sec: 12.61, Grad Norm: 0.0067 +[2025-02-21 01:52:02] (step=0144900) Train Loss: 0.1488, Train Steps/Sec: 10.69, Grad Norm: 0.0067 +[2025-02-21 01:52:11] (step=0145000) Train Loss: 0.1470, Train Steps/Sec: 11.33, Grad Norm: 0.0071 +[2025-02-21 01:52:18] (step=0145100) Train Loss: 0.1483, Train Steps/Sec: 12.55, Grad Norm: 0.0065 +[2025-02-21 01:52:27] (step=0145200) Train Loss: 0.1495, Train Steps/Sec: 11.28, Grad Norm: 0.0066 +[2025-02-21 01:52:35] (step=0145300) Train Loss: 0.1494, Train Steps/Sec: 12.63, Grad Norm: 0.0066 +[2025-02-21 01:52:43] (step=0145400) Train Loss: 0.1483, Train Steps/Sec: 12.61, Grad Norm: 0.0067 +[2025-02-21 01:52:51] (step=0145500) Train Loss: 0.1489, Train Steps/Sec: 12.63, Grad Norm: 0.0067 +[2025-02-21 01:52:59] (step=0145600) Train Loss: 0.1470, Train Steps/Sec: 12.64, Grad Norm: 0.0066 +[2025-02-21 01:53:07] (step=0145700) Train Loss: 0.1482, Train Steps/Sec: 12.65, Grad Norm: 0.0065 +[2025-02-21 01:53:15] (step=0145800) Train Loss: 0.1477, Train Steps/Sec: 12.65, Grad Norm: 0.0067 +[2025-02-21 01:53:24] (step=0145900) Train Loss: 0.1487, Train Steps/Sec: 11.42, Grad Norm: 0.0068 +[2025-02-21 01:53:33] (step=0146000) Train Loss: 0.1484, Train Steps/Sec: 11.08, Grad Norm: 0.0068 +[2025-02-21 01:53:41] (step=0146100) Train Loss: 0.1478, Train Steps/Sec: 12.20, Grad Norm: 0.0068 +[2025-02-21 01:53:49] (step=0146200) Train Loss: 0.1480, Train Steps/Sec: 12.59, Grad Norm: 0.0066 +[2025-02-21 01:53:57] (step=0146300) Train Loss: 0.1486, Train Steps/Sec: 12.61, Grad Norm: 0.0071 +[2025-02-21 01:54:06] (step=0146400) Train Loss: 0.1480, Train Steps/Sec: 11.20, Grad Norm: 0.0066 +[2025-02-21 01:54:14] (step=0146500) Train Loss: 0.1489, Train Steps/Sec: 12.14, Grad Norm: 0.0067 +[2025-02-21 01:54:22] (step=0146600) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 01:54:31] (step=0146700) Train Loss: 0.1486, Train Steps/Sec: 11.33, Grad Norm: 0.0068 +[2025-02-21 01:54:41] (step=0146800) Train Loss: 0.1488, Train Steps/Sec: 9.45, Grad Norm: 0.0066 +[2025-02-21 01:54:50] (step=0146900) Train Loss: 0.1478, Train Steps/Sec: 11.34, Grad Norm: 0.0065 +[2025-02-21 01:54:58] (step=0147000) Train Loss: 0.1488, Train Steps/Sec: 12.58, Grad Norm: 0.0070 +[2025-02-21 01:55:08] (step=0147100) Train Loss: 0.1485, Train Steps/Sec: 10.29, Grad Norm: 0.0068 +[2025-02-21 01:55:16] (step=0147200) Train Loss: 0.1478, Train Steps/Sec: 11.64, Grad Norm: 0.0066 +[2025-02-21 01:55:24] (step=0147300) Train Loss: 0.1471, Train Steps/Sec: 12.55, Grad Norm: 0.0069 +[2025-02-21 01:55:32] (step=0147400) Train Loss: 0.1475, Train Steps/Sec: 12.56, Grad Norm: 0.0068 +[2025-02-21 01:55:40] (step=0147500) Train Loss: 0.1484, Train Steps/Sec: 12.55, Grad Norm: 0.0065 +[2025-02-21 01:55:48] (step=0147600) Train Loss: 0.1476, Train Steps/Sec: 12.49, Grad Norm: 0.0068 +[2025-02-21 01:55:57] (step=0147700) Train Loss: 0.1485, Train Steps/Sec: 11.26, Grad Norm: 0.0067 +[2025-02-21 01:56:05] (step=0147800) Train Loss: 0.1481, Train Steps/Sec: 12.61, Grad Norm: 0.0068 +[2025-02-21 01:56:13] (step=0147900) Train Loss: 0.1479, Train Steps/Sec: 12.61, Grad Norm: 0.0067 +[2025-02-21 01:56:21] (step=0148000) Train Loss: 0.1491, Train Steps/Sec: 12.59, Grad Norm: 0.0068 +[2025-02-21 01:56:31] (step=0148100) Train Loss: 0.1480, Train Steps/Sec: 10.07, Grad Norm: 0.0066 +[2025-02-21 01:56:39] (step=0148200) Train Loss: 0.1467, Train Steps/Sec: 12.19, Grad Norm: 0.0068 +[2025-02-21 01:56:47] (step=0148300) Train Loss: 0.1482, Train Steps/Sec: 12.61, Grad Norm: 0.0068 +[2025-02-21 01:56:55] (step=0148400) Train Loss: 0.1478, Train Steps/Sec: 12.61, Grad Norm: 0.0069 +[2025-02-21 01:57:03] (step=0148500) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 01:57:11] (step=0148600) Train Loss: 0.1489, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 01:57:19] (step=0148700) Train Loss: 0.1473, Train Steps/Sec: 12.12, Grad Norm: 0.0066 +[2025-02-21 01:57:27] (step=0148800) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 01:57:37] (step=0148900) Train Loss: 0.1487, Train Steps/Sec: 9.65, Grad Norm: 0.0067 +[2025-02-21 01:57:47] (step=0149000) Train Loss: 0.1482, Train Steps/Sec: 9.99, Grad Norm: 0.0067 +[2025-02-21 01:57:56] (step=0149100) Train Loss: 0.1491, Train Steps/Sec: 11.33, Grad Norm: 0.0068 +[2025-02-21 01:58:04] (step=0149200) Train Loss: 0.1488, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 01:58:14] (step=0149300) Train Loss: 0.1478, Train Steps/Sec: 10.04, Grad Norm: 0.0066 +[2025-02-21 01:58:22] (step=0149400) Train Loss: 0.1475, Train Steps/Sec: 12.15, Grad Norm: 0.0066 +[2025-02-21 01:58:30] (step=0149500) Train Loss: 0.1500, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 01:58:38] (step=0149600) Train Loss: 0.1489, Train Steps/Sec: 12.62, Grad Norm: 0.0068 +[2025-02-21 01:58:46] (step=0149700) Train Loss: 0.1488, Train Steps/Sec: 12.62, Grad Norm: 0.0068 +[2025-02-21 01:58:54] (step=0149800) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0067 +[2025-02-21 01:59:02] (step=0149900) Train Loss: 0.1479, Train Steps/Sec: 12.62, Grad Norm: 0.0069 +[2025-02-21 01:59:10] (step=0150000) Train Loss: 0.1478, Train Steps/Sec: 12.63, Grad Norm: 0.0067 +[2025-02-21 01:59:18] (step=0150100) Train Loss: 0.1480, Train Steps/Sec: 12.57, Grad Norm: 0.0068 +[2025-02-21 01:59:28] (step=0150200) Train Loss: 0.1479, Train Steps/Sec: 9.46, Grad Norm: 0.0069 +[2025-02-21 01:59:37] (step=0150300) Train Loss: 0.1483, Train Steps/Sec: 11.76, Grad Norm: 0.0069 +[2025-02-21 01:59:45] (step=0150400) Train Loss: 0.1476, Train Steps/Sec: 12.57, Grad Norm: 0.0067 +[2025-02-21 01:59:53] (step=0150500) Train Loss: 0.1494, Train Steps/Sec: 12.63, Grad Norm: 0.0066 +[2025-02-21 02:00:01] (step=0150600) Train Loss: 0.1493, Train Steps/Sec: 12.60, Grad Norm: 0.0071 +[2025-02-21 02:00:09] (step=0150700) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 02:00:17] (step=0150800) Train Loss: 0.1464, Train Steps/Sec: 12.11, Grad Norm: 0.0066 +[2025-02-21 02:00:25] (step=0150900) Train Loss: 0.1484, Train Steps/Sec: 12.60, Grad Norm: 0.0066 +[2025-02-21 02:00:33] (step=0151000) Train Loss: 0.1482, Train Steps/Sec: 12.59, Grad Norm: 0.0067 +[2025-02-21 02:00:42] (step=0151100) Train Loss: 0.1479, Train Steps/Sec: 10.58, Grad Norm: 0.0065 +[2025-02-21 02:00:52] (step=0151200) Train Loss: 0.1482, Train Steps/Sec: 9.73, Grad Norm: 0.0072 +[2025-02-21 02:01:01] (step=0151300) Train Loss: 0.1481, Train Steps/Sec: 11.67, Grad Norm: 0.0065 +[2025-02-21 02:01:10] (step=0151400) Train Loss: 0.1469, Train Steps/Sec: 11.22, Grad Norm: 0.0066 +[2025-02-21 02:01:20] (step=0151500) Train Loss: 0.1492, Train Steps/Sec: 10.02, Grad Norm: 0.0068 +[2025-02-21 02:01:28] (step=0151600) Train Loss: 0.1482, Train Steps/Sec: 12.13, Grad Norm: 0.0065 +[2025-02-21 02:01:36] (step=0151700) Train Loss: 0.1476, Train Steps/Sec: 12.62, Grad Norm: 0.0067 +[2025-02-21 02:01:44] (step=0151800) Train Loss: 0.1477, Train Steps/Sec: 12.63, Grad Norm: 0.0065 +[2025-02-21 02:01:52] (step=0151900) Train Loss: 0.1480, Train Steps/Sec: 12.64, Grad Norm: 0.0067 +[2025-02-21 02:02:00] (step=0152000) Train Loss: 0.1489, Train Steps/Sec: 12.60, Grad Norm: 0.0072 +[2025-02-21 02:02:08] (step=0152100) Train Loss: 0.1485, Train Steps/Sec: 12.56, Grad Norm: 0.0067 +[2025-02-21 02:02:16] (step=0152200) Train Loss: 0.1486, Train Steps/Sec: 12.56, Grad Norm: 0.0067 +[2025-02-21 02:02:24] (step=0152300) Train Loss: 0.1493, Train Steps/Sec: 11.75, Grad Norm: 0.0065 +[2025-02-21 02:02:34] (step=0152400) Train Loss: 0.1490, Train Steps/Sec: 10.37, Grad Norm: 0.0068 +[2025-02-21 02:02:42] (step=0152500) Train Loss: 0.1471, Train Steps/Sec: 12.60, Grad Norm: 0.0068 +[2025-02-21 02:02:50] (step=0152600) Train Loss: 0.1497, Train Steps/Sec: 12.52, Grad Norm: 0.0065 +[2025-02-21 02:02:59] (step=0152700) Train Loss: 0.1480, Train Steps/Sec: 11.23, Grad Norm: 0.0066 +[2025-02-21 02:03:07] (step=0152800) Train Loss: 0.1476, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 02:03:15] (step=0152900) Train Loss: 0.1489, Train Steps/Sec: 12.57, Grad Norm: 0.0069 +[2025-02-21 02:03:23] (step=0153000) Train Loss: 0.1489, Train Steps/Sec: 12.12, Grad Norm: 0.0066 +[2025-02-21 02:03:31] (step=0153100) Train Loss: 0.1482, Train Steps/Sec: 12.60, Grad Norm: 0.0067 +[2025-02-21 02:03:39] (step=0153200) Train Loss: 0.1495, Train Steps/Sec: 12.57, Grad Norm: 0.0068 +[2025-02-21 02:03:49] (step=0153300) Train Loss: 0.1483, Train Steps/Sec: 9.99, Grad Norm: 0.0066 +[2025-02-21 02:03:59] (step=0153400) Train Loss: 0.1486, Train Steps/Sec: 10.32, Grad Norm: 0.0068 +[2025-02-21 02:04:07] (step=0153500) Train Loss: 0.1474, Train Steps/Sec: 12.16, Grad Norm: 0.0065 +[2025-02-21 02:04:15] (step=0153600) Train Loss: 0.1474, Train Steps/Sec: 12.15, Grad Norm: 0.0067 +[2025-02-21 02:04:25] (step=0153700) Train Loss: 0.1480, Train Steps/Sec: 9.93, Grad Norm: 0.0067 +[2025-02-21 02:04:33] (step=0153800) Train Loss: 0.1479, Train Steps/Sec: 12.57, Grad Norm: 0.0065 +[2025-02-21 02:04:42] (step=0153900) Train Loss: 0.1490, Train Steps/Sec: 11.10, Grad Norm: 0.0065 +[2025-02-21 02:04:50] (step=0154000) Train Loss: 0.1481, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 02:04:58] (step=0154100) Train Loss: 0.1485, Train Steps/Sec: 12.57, Grad Norm: 0.0067 +[2025-02-21 02:05:06] (step=0154200) Train Loss: 0.1480, Train Steps/Sec: 12.55, Grad Norm: 0.0067 +[2025-02-21 02:05:14] (step=0154300) Train Loss: 0.1475, Train Steps/Sec: 12.56, Grad Norm: 0.0067 +[2025-02-21 02:05:22] (step=0154400) Train Loss: 0.1499, Train Steps/Sec: 12.58, Grad Norm: 0.0067 +[2025-02-21 02:05:31] (step=0154500) Train Loss: 0.1478, Train Steps/Sec: 10.44, Grad Norm: 0.0067 +[2025-02-21 02:05:40] (step=0154600) Train Loss: 0.1486, Train Steps/Sec: 11.77, Grad Norm: 0.0068 +[2025-02-21 02:05:48] (step=0154700) Train Loss: 0.1485, Train Steps/Sec: 12.57, Grad Norm: 0.0068 +[2025-02-21 02:05:56] (step=0154800) Train Loss: 0.1482, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 02:06:04] (step=0154900) Train Loss: 0.1488, Train Steps/Sec: 12.60, Grad Norm: 0.0067 +[2025-02-21 02:06:12] (step=0155000) Train Loss: 0.1477, Train Steps/Sec: 12.59, Grad Norm: 0.0067 +[2025-02-21 02:06:20] (step=0155100) Train Loss: 0.1495, Train Steps/Sec: 12.51, Grad Norm: 0.0067 +[2025-02-21 02:06:29] (step=0155200) Train Loss: 0.1477, Train Steps/Sec: 10.82, Grad Norm: 0.0065 +[2025-02-21 02:06:37] (step=0155300) Train Loss: 0.1487, Train Steps/Sec: 12.60, Grad Norm: 0.0067 +[2025-02-21 02:06:45] (step=0155400) Train Loss: 0.1464, Train Steps/Sec: 12.16, Grad Norm: 0.0067 +[2025-02-21 02:06:55] (step=0155500) Train Loss: 0.1480, Train Steps/Sec: 9.68, Grad Norm: 0.0067 +[2025-02-21 02:07:05] (step=0155600) Train Loss: 0.1479, Train Steps/Sec: 10.99, Grad Norm: 0.0068 +[2025-02-21 02:07:13] (step=0155700) Train Loss: 0.1472, Train Steps/Sec: 12.15, Grad Norm: 0.0064 +[2025-02-21 02:07:21] (step=0155800) Train Loss: 0.1496, Train Steps/Sec: 11.75, Grad Norm: 0.0066 +[2025-02-21 02:07:31] (step=0155900) Train Loss: 0.1483, Train Steps/Sec: 10.29, Grad Norm: 0.0066 +[2025-02-21 02:07:39] (step=0156000) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0069 +[2025-02-21 02:07:47] (step=0156100) Train Loss: 0.1479, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 02:07:55] (step=0156200) Train Loss: 0.1482, Train Steps/Sec: 12.58, Grad Norm: 0.0066 +[2025-02-21 02:08:03] (step=0156300) Train Loss: 0.1497, Train Steps/Sec: 12.60, Grad Norm: 0.0066 +[2025-02-21 02:08:12] (step=0156400) Train Loss: 0.1469, Train Steps/Sec: 11.22, Grad Norm: 0.0065 +[2025-02-21 02:08:20] (step=0156500) Train Loss: 0.1478, Train Steps/Sec: 12.62, Grad Norm: 0.0066 +[2025-02-21 02:08:29] (step=0156600) Train Loss: 0.1481, Train Steps/Sec: 10.41, Grad Norm: 0.0067 +[2025-02-21 02:08:38] (step=0156700) Train Loss: 0.1485, Train Steps/Sec: 11.80, Grad Norm: 0.0064 +[2025-02-21 02:08:46] (step=0156800) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 02:08:54] (step=0156900) Train Loss: 0.1478, Train Steps/Sec: 12.57, Grad Norm: 0.0066 +[2025-02-21 02:09:01] (step=0157000) Train Loss: 0.1470, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 02:09:09] (step=0157100) Train Loss: 0.1497, Train Steps/Sec: 12.61, Grad Norm: 0.0067 +[2025-02-21 02:09:17] (step=0157200) Train Loss: 0.1486, Train Steps/Sec: 12.61, Grad Norm: 0.0069 +[2025-02-21 02:09:25] (step=0157300) Train Loss: 0.1481, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 02:09:33] (step=0157400) Train Loss: 0.1490, Train Steps/Sec: 12.15, Grad Norm: 0.0065 +[2025-02-21 02:09:41] (step=0157500) Train Loss: 0.1479, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 02:09:50] (step=0157600) Train Loss: 0.1485, Train Steps/Sec: 12.10, Grad Norm: 0.0068 +[2025-02-21 02:10:01] (step=0157700) Train Loss: 0.1494, Train Steps/Sec: 8.88, Grad Norm: 0.0066 +[2025-02-21 02:10:10] (step=0157800) Train Loss: 0.1488, Train Steps/Sec: 10.60, Grad Norm: 0.0066 +[2025-02-21 02:10:18] (step=0157900) Train Loss: 0.1484, Train Steps/Sec: 12.60, Grad Norm: 0.0067 +[2025-02-21 02:10:27] (step=0158000) Train Loss: 0.1482, Train Steps/Sec: 11.72, Grad Norm: 0.0065 +[2025-02-21 02:10:37] (step=0158100) Train Loss: 0.1492, Train Steps/Sec: 10.27, Grad Norm: 0.0067 +[2025-02-21 02:10:45] (step=0158200) Train Loss: 0.1490, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 02:10:52] (step=0158300) Train Loss: 0.1492, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 02:11:00] (step=0158400) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 02:11:08] (step=0158500) Train Loss: 0.1490, Train Steps/Sec: 12.59, Grad Norm: 0.0066 +[2025-02-21 02:11:16] (step=0158600) Train Loss: 0.1493, Train Steps/Sec: 12.59, Grad Norm: 0.0069 +[2025-02-21 02:11:24] (step=0158700) Train Loss: 0.1477, Train Steps/Sec: 12.58, Grad Norm: 0.0065 +[2025-02-21 02:11:34] (step=0158800) Train Loss: 0.1478, Train Steps/Sec: 10.39, Grad Norm: 0.0067 +[2025-02-21 02:11:43] (step=0158900) Train Loss: 0.1482, Train Steps/Sec: 10.83, Grad Norm: 0.0065 +[2025-02-21 02:11:51] (step=0159000) Train Loss: 0.1488, Train Steps/Sec: 12.59, Grad Norm: 0.0067 +[2025-02-21 02:11:59] (step=0159100) Train Loss: 0.1486, Train Steps/Sec: 12.58, Grad Norm: 0.0066 +[2025-02-21 02:12:07] (step=0159200) Train Loss: 0.1481, Train Steps/Sec: 12.61, Grad Norm: 0.0068 +[2025-02-21 02:12:15] (step=0159300) Train Loss: 0.1478, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 02:12:23] (step=0159400) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 02:12:31] (step=0159500) Train Loss: 0.1492, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 02:12:39] (step=0159600) Train Loss: 0.1489, Train Steps/Sec: 12.13, Grad Norm: 0.0067 +[2025-02-21 02:12:47] (step=0159700) Train Loss: 0.1482, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 02:12:56] (step=0159800) Train Loss: 0.1479, Train Steps/Sec: 11.33, Grad Norm: 0.0066 +[2025-02-21 02:13:05] (step=0159900) Train Loss: 0.1477, Train Steps/Sec: 10.30, Grad Norm: 0.0065 +[2025-02-21 02:13:15] (step=0160000) Train Loss: 0.1475, Train Steps/Sec: 10.61, Grad Norm: 0.0063 +[2025-02-21 02:13:18] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0160000.pt +[2025-02-21 02:13:26] (step=0160100) Train Loss: 0.1478, Train Steps/Sec: 8.94, Grad Norm: 0.0063 +[2025-02-21 02:13:36] (step=0160200) Train Loss: 0.1485, Train Steps/Sec: 9.91, Grad Norm: 0.0069 +[2025-02-21 02:13:45] (step=0160300) Train Loss: 0.1489, Train Steps/Sec: 10.90, Grad Norm: 0.0068 +[2025-02-21 02:13:53] (step=0160400) Train Loss: 0.1473, Train Steps/Sec: 12.58, Grad Norm: 0.0066 +[2025-02-21 02:14:01] (step=0160500) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 02:14:09] (step=0160600) Train Loss: 0.1485, Train Steps/Sec: 12.61, Grad Norm: 0.0068 +[2025-02-21 02:14:17] (step=0160700) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0067 +[2025-02-21 02:14:25] (step=0160800) Train Loss: 0.1482, Train Steps/Sec: 12.62, Grad Norm: 0.0067 +[2025-02-21 02:14:34] (step=0160900) Train Loss: 0.1488, Train Steps/Sec: 10.72, Grad Norm: 0.0065 +[2025-02-21 02:14:43] (step=0161000) Train Loss: 0.1484, Train Steps/Sec: 11.80, Grad Norm: 0.0067 +[2025-02-21 02:14:51] (step=0161100) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0067 +[2025-02-21 02:14:59] (step=0161200) Train Loss: 0.1458, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 02:15:07] (step=0161300) Train Loss: 0.1492, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 02:15:16] (step=0161400) Train Loss: 0.1483, Train Steps/Sec: 11.21, Grad Norm: 0.0065 +[2025-02-21 02:15:24] (step=0161500) Train Loss: 0.1480, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 02:15:31] (step=0161600) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0066 +[2025-02-21 02:15:39] (step=0161700) Train Loss: 0.1485, Train Steps/Sec: 12.60, Grad Norm: 0.0068 +[2025-02-21 02:15:48] (step=0161800) Train Loss: 0.1492, Train Steps/Sec: 12.14, Grad Norm: 0.0066 +[2025-02-21 02:15:56] (step=0161900) Train Loss: 0.1485, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 02:16:04] (step=0162000) Train Loss: 0.1479, Train Steps/Sec: 11.33, Grad Norm: 0.0065 +[2025-02-21 02:16:14] (step=0162100) Train Loss: 0.1480, Train Steps/Sec: 10.00, Grad Norm: 0.0066 +[2025-02-21 02:16:23] (step=0162200) Train Loss: 0.1495, Train Steps/Sec: 11.01, Grad Norm: 0.0066 +[2025-02-21 02:16:31] (step=0162300) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0068 +[2025-02-21 02:16:40] (step=0162400) Train Loss: 0.1474, Train Steps/Sec: 11.01, Grad Norm: 0.0063 +[2025-02-21 02:16:50] (step=0162500) Train Loss: 0.1470, Train Steps/Sec: 10.91, Grad Norm: 0.0066 +[2025-02-21 02:16:58] (step=0162600) Train Loss: 0.1478, Train Steps/Sec: 12.53, Grad Norm: 0.0064 +[2025-02-21 02:17:07] (step=0162700) Train Loss: 0.1486, Train Steps/Sec: 11.23, Grad Norm: 0.0066 +[2025-02-21 02:17:14] (step=0162800) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 02:17:22] (step=0162900) Train Loss: 0.1491, Train Steps/Sec: 12.57, Grad Norm: 0.0064 +[2025-02-21 02:17:31] (step=0163000) Train Loss: 0.1490, Train Steps/Sec: 11.03, Grad Norm: 0.0063 +[2025-02-21 02:17:40] (step=0163100) Train Loss: 0.1480, Train Steps/Sec: 11.37, Grad Norm: 0.0068 +[2025-02-21 02:17:48] (step=0163200) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0066 +[2025-02-21 02:17:56] (step=0163300) Train Loss: 0.1485, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 02:18:04] (step=0163400) Train Loss: 0.1486, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 02:18:12] (step=0163500) Train Loss: 0.1478, Train Steps/Sec: 12.58, Grad Norm: 0.0065 +[2025-02-21 02:18:20] (step=0163600) Train Loss: 0.1486, Train Steps/Sec: 12.62, Grad Norm: 0.0066 +[2025-02-21 02:18:28] (step=0163700) Train Loss: 0.1478, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 02:18:36] (step=0163800) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 02:18:45] (step=0163900) Train Loss: 0.1486, Train Steps/Sec: 11.23, Grad Norm: 0.0067 +[2025-02-21 02:18:53] (step=0164000) Train Loss: 0.1479, Train Steps/Sec: 12.14, Grad Norm: 0.0066 +[2025-02-21 02:19:01] (step=0164100) Train Loss: 0.1488, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 02:19:10] (step=0164200) Train Loss: 0.1484, Train Steps/Sec: 10.58, Grad Norm: 0.0065 +[2025-02-21 02:19:20] (step=0164300) Train Loss: 0.1488, Train Steps/Sec: 10.03, Grad Norm: 0.0064 +[2025-02-21 02:19:29] (step=0164400) Train Loss: 0.1480, Train Steps/Sec: 11.33, Grad Norm: 0.0067 +[2025-02-21 02:19:37] (step=0164500) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 02:19:46] (step=0164600) Train Loss: 0.1478, Train Steps/Sec: 11.03, Grad Norm: 0.0063 +[2025-02-21 02:19:55] (step=0164700) Train Loss: 0.1473, Train Steps/Sec: 10.91, Grad Norm: 0.0065 +[2025-02-21 02:20:03] (step=0164800) Train Loss: 0.1485, Train Steps/Sec: 12.62, Grad Norm: 0.0067 +[2025-02-21 02:20:11] (step=0164900) Train Loss: 0.1487, Train Steps/Sec: 12.60, Grad Norm: 0.0067 +[2025-02-21 02:20:19] (step=0165000) Train Loss: 0.1470, Train Steps/Sec: 12.57, Grad Norm: 0.0066 +[2025-02-21 02:20:27] (step=0165100) Train Loss: 0.1488, Train Steps/Sec: 12.53, Grad Norm: 0.0066 +[2025-02-21 02:20:38] (step=0165200) Train Loss: 0.1478, Train Steps/Sec: 9.53, Grad Norm: 0.0065 +[2025-02-21 02:20:46] (step=0165300) Train Loss: 0.1473, Train Steps/Sec: 12.12, Grad Norm: 0.0069 +[2025-02-21 02:20:54] (step=0165400) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 02:21:02] (step=0165500) Train Loss: 0.1484, Train Steps/Sec: 12.62, Grad Norm: 0.0066 +[2025-02-21 02:21:10] (step=0165600) Train Loss: 0.1484, Train Steps/Sec: 12.63, Grad Norm: 0.0066 +[2025-02-21 02:21:18] (step=0165700) Train Loss: 0.1487, Train Steps/Sec: 12.63, Grad Norm: 0.0066 +[2025-02-21 02:21:25] (step=0165800) Train Loss: 0.1472, Train Steps/Sec: 12.62, Grad Norm: 0.0069 +[2025-02-21 02:21:33] (step=0165900) Train Loss: 0.1498, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 02:21:41] (step=0166000) Train Loss: 0.1484, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 02:21:50] (step=0166100) Train Loss: 0.1488, Train Steps/Sec: 12.13, Grad Norm: 0.0067 +[2025-02-21 02:21:58] (step=0166200) Train Loss: 0.1490, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 02:22:05] (step=0166300) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0068 +[2025-02-21 02:22:16] (step=0166400) Train Loss: 0.1475, Train Steps/Sec: 9.34, Grad Norm: 0.0067 +[2025-02-21 02:22:26] (step=0166500) Train Loss: 0.1483, Train Steps/Sec: 10.01, Grad Norm: 0.0064 +[2025-02-21 02:22:34] (step=0166600) Train Loss: 0.1482, Train Steps/Sec: 12.14, Grad Norm: 0.0064 +[2025-02-21 02:22:42] (step=0166700) Train Loss: 0.1488, Train Steps/Sec: 12.63, Grad Norm: 0.0066 +[2025-02-21 02:22:51] (step=0166800) Train Loss: 0.1473, Train Steps/Sec: 11.00, Grad Norm: 0.0066 +[2025-02-21 02:23:01] (step=0166900) Train Loss: 0.1479, Train Steps/Sec: 10.92, Grad Norm: 0.0068 +[2025-02-21 02:23:08] (step=0167000) Train Loss: 0.1490, Train Steps/Sec: 12.63, Grad Norm: 0.0069 +[2025-02-21 02:23:16] (step=0167100) Train Loss: 0.1473, Train Steps/Sec: 12.63, Grad Norm: 0.0068 +[2025-02-21 02:23:24] (step=0167200) Train Loss: 0.1486, Train Steps/Sec: 12.63, Grad Norm: 0.0067 +[2025-02-21 02:23:33] (step=0167300) Train Loss: 0.1476, Train Steps/Sec: 11.07, Grad Norm: 0.0062 +[2025-02-21 02:23:42] (step=0167400) Train Loss: 0.1472, Train Steps/Sec: 11.43, Grad Norm: 0.0064 +[2025-02-21 02:23:50] (step=0167500) Train Loss: 0.1485, Train Steps/Sec: 12.63, Grad Norm: 0.0066 +[2025-02-21 02:23:58] (step=0167600) Train Loss: 0.1478, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 02:24:07] (step=0167700) Train Loss: 0.1485, Train Steps/Sec: 11.20, Grad Norm: 0.0065 +[2025-02-21 02:24:15] (step=0167800) Train Loss: 0.1480, Train Steps/Sec: 12.62, Grad Norm: 0.0066 +[2025-02-21 02:24:23] (step=0167900) Train Loss: 0.1480, Train Steps/Sec: 12.63, Grad Norm: 0.0067 +[2025-02-21 02:24:31] (step=0168000) Train Loss: 0.1469, Train Steps/Sec: 12.63, Grad Norm: 0.0066 +[2025-02-21 02:24:39] (step=0168100) Train Loss: 0.1485, Train Steps/Sec: 12.63, Grad Norm: 0.0065 +[2025-02-21 02:24:46] (step=0168200) Train Loss: 0.1492, Train Steps/Sec: 12.62, Grad Norm: 0.0067 +[2025-02-21 02:24:55] (step=0168300) Train Loss: 0.1483, Train Steps/Sec: 12.14, Grad Norm: 0.0064 +[2025-02-21 02:25:03] (step=0168400) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 02:25:11] (step=0168500) Train Loss: 0.1480, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 02:25:21] (step=0168600) Train Loss: 0.1467, Train Steps/Sec: 9.70, Grad Norm: 0.0067 +[2025-02-21 02:25:30] (step=0168700) Train Loss: 0.1480, Train Steps/Sec: 10.61, Grad Norm: 0.0066 +[2025-02-21 02:25:39] (step=0168800) Train Loss: 0.1480, Train Steps/Sec: 12.12, Grad Norm: 0.0066 +[2025-02-21 02:25:48] (step=0168900) Train Loss: 0.1479, Train Steps/Sec: 10.88, Grad Norm: 0.0066 +[2025-02-21 02:25:57] (step=0169000) Train Loss: 0.1487, Train Steps/Sec: 10.99, Grad Norm: 0.0062 +[2025-02-21 02:26:06] (step=0169100) Train Loss: 0.1483, Train Steps/Sec: 11.28, Grad Norm: 0.0066 +[2025-02-21 02:26:14] (step=0169200) Train Loss: 0.1474, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 02:26:22] (step=0169300) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 02:26:30] (step=0169400) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 02:26:39] (step=0169500) Train Loss: 0.1474, Train Steps/Sec: 10.15, Grad Norm: 0.0065 +[2025-02-21 02:26:47] (step=0169600) Train Loss: 0.1480, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 02:26:55] (step=0169700) Train Loss: 0.1486, Train Steps/Sec: 12.58, Grad Norm: 0.0065 +[2025-02-21 02:27:03] (step=0169800) Train Loss: 0.1470, Train Steps/Sec: 12.51, Grad Norm: 0.0065 +[2025-02-21 02:27:11] (step=0169900) Train Loss: 0.1495, Train Steps/Sec: 12.51, Grad Norm: 0.0065 +[2025-02-21 02:27:19] (step=0170000) Train Loss: 0.1461, Train Steps/Sec: 12.50, Grad Norm: 0.0064 +[2025-02-21 02:27:27] (step=0170100) Train Loss: 0.1481, Train Steps/Sec: 12.50, Grad Norm: 0.0064 +[2025-02-21 02:27:36] (step=0170200) Train Loss: 0.1490, Train Steps/Sec: 11.12, Grad Norm: 0.0066 +[2025-02-21 02:27:44] (step=0170300) Train Loss: 0.1472, Train Steps/Sec: 12.56, Grad Norm: 0.0066 +[2025-02-21 02:27:52] (step=0170400) Train Loss: 0.1482, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 02:28:00] (step=0170500) Train Loss: 0.1473, Train Steps/Sec: 12.12, Grad Norm: 0.0064 +[2025-02-21 02:28:08] (step=0170600) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0066 +[2025-02-21 02:28:16] (step=0170700) Train Loss: 0.1487, Train Steps/Sec: 12.58, Grad Norm: 0.0066 +[2025-02-21 02:28:27] (step=0170800) Train Loss: 0.1482, Train Steps/Sec: 9.43, Grad Norm: 0.0064 +[2025-02-21 02:28:36] (step=0170900) Train Loss: 0.1498, Train Steps/Sec: 10.63, Grad Norm: 0.0064 +[2025-02-21 02:28:45] (step=0171000) Train Loss: 0.1470, Train Steps/Sec: 12.14, Grad Norm: 0.0063 +[2025-02-21 02:28:53] (step=0171100) Train Loss: 0.1479, Train Steps/Sec: 12.09, Grad Norm: 0.0066 +[2025-02-21 02:29:02] (step=0171200) Train Loss: 0.1472, Train Steps/Sec: 11.01, Grad Norm: 0.0063 +[2025-02-21 02:29:11] (step=0171300) Train Loss: 0.1479, Train Steps/Sec: 11.30, Grad Norm: 0.0071 +[2025-02-21 02:29:20] (step=0171400) Train Loss: 0.1476, Train Steps/Sec: 11.24, Grad Norm: 0.0063 +[2025-02-21 02:29:28] (step=0171500) Train Loss: 0.1496, Train Steps/Sec: 12.60, Grad Norm: 0.0066 +[2025-02-21 02:29:37] (step=0171600) Train Loss: 0.1483, Train Steps/Sec: 10.42, Grad Norm: 0.0063 +[2025-02-21 02:29:45] (step=0171700) Train Loss: 0.1486, Train Steps/Sec: 12.16, Grad Norm: 0.0064 +[2025-02-21 02:29:53] (step=0171800) Train Loss: 0.1480, Train Steps/Sec: 12.62, Grad Norm: 0.0066 +[2025-02-21 02:30:01] (step=0171900) Train Loss: 0.1477, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 02:30:09] (step=0172000) Train Loss: 0.1483, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 02:30:17] (step=0172100) Train Loss: 0.1479, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 02:30:25] (step=0172200) Train Loss: 0.1479, Train Steps/Sec: 12.57, Grad Norm: 0.0064 +[2025-02-21 02:30:33] (step=0172300) Train Loss: 0.1485, Train Steps/Sec: 12.60, Grad Norm: 0.0069 +[2025-02-21 02:30:41] (step=0172400) Train Loss: 0.1469, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 02:30:49] (step=0172500) Train Loss: 0.1473, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 02:30:57] (step=0172600) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 02:31:06] (step=0172700) Train Loss: 0.1474, Train Steps/Sec: 10.87, Grad Norm: 0.0063 +[2025-02-21 02:31:14] (step=0172800) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 02:31:23] (step=0172900) Train Loss: 0.1468, Train Steps/Sec: 11.70, Grad Norm: 0.0063 +[2025-02-21 02:31:32] (step=0173000) Train Loss: 0.1487, Train Steps/Sec: 10.30, Grad Norm: 0.0065 +[2025-02-21 02:31:42] (step=0173100) Train Loss: 0.1480, Train Steps/Sec: 10.63, Grad Norm: 0.0066 +[2025-02-21 02:31:50] (step=0173200) Train Loss: 0.1467, Train Steps/Sec: 12.15, Grad Norm: 0.0064 +[2025-02-21 02:31:58] (step=0173300) Train Loss: 0.1470, Train Steps/Sec: 12.14, Grad Norm: 0.0065 +[2025-02-21 02:32:08] (step=0173400) Train Loss: 0.1477, Train Steps/Sec: 10.64, Grad Norm: 0.0065 +[2025-02-21 02:32:16] (step=0173500) Train Loss: 0.1487, Train Steps/Sec: 11.70, Grad Norm: 0.0064 +[2025-02-21 02:32:24] (step=0173600) Train Loss: 0.1479, Train Steps/Sec: 12.61, Grad Norm: 0.0067 +[2025-02-21 02:32:33] (step=0173700) Train Loss: 0.1476, Train Steps/Sec: 11.09, Grad Norm: 0.0066 +[2025-02-21 02:32:42] (step=0173800) Train Loss: 0.1467, Train Steps/Sec: 11.39, Grad Norm: 0.0062 +[2025-02-21 02:32:51] (step=0173900) Train Loss: 0.1495, Train Steps/Sec: 11.23, Grad Norm: 0.0064 +[2025-02-21 02:32:59] (step=0174000) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 02:33:07] (step=0174100) Train Loss: 0.1486, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 02:33:15] (step=0174200) Train Loss: 0.1486, Train Steps/Sec: 12.59, Grad Norm: 0.0070 +[2025-02-21 02:33:23] (step=0174300) Train Loss: 0.1479, Train Steps/Sec: 12.58, Grad Norm: 0.0066 +[2025-02-21 02:33:30] (step=0174400) Train Loss: 0.1487, Train Steps/Sec: 12.59, Grad Norm: 0.0068 +[2025-02-21 02:33:38] (step=0174500) Train Loss: 0.1486, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 02:33:46] (step=0174600) Train Loss: 0.1479, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 02:33:54] (step=0174700) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0066 +[2025-02-21 02:34:02] (step=0174800) Train Loss: 0.1482, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 02:34:10] (step=0174900) Train Loss: 0.1480, Train Steps/Sec: 12.14, Grad Norm: 0.0064 +[2025-02-21 02:34:18] (step=0175000) Train Loss: 0.1485, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 02:34:27] (step=0175100) Train Loss: 0.1477, Train Steps/Sec: 11.69, Grad Norm: 0.0062 +[2025-02-21 02:34:38] (step=0175200) Train Loss: 0.1483, Train Steps/Sec: 9.17, Grad Norm: 0.0067 +[2025-02-21 02:34:47] (step=0175300) Train Loss: 0.1477, Train Steps/Sec: 10.62, Grad Norm: 0.0062 +[2025-02-21 02:34:55] (step=0175400) Train Loss: 0.1477, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 02:35:03] (step=0175500) Train Loss: 0.1473, Train Steps/Sec: 12.14, Grad Norm: 0.0067 +[2025-02-21 02:35:13] (step=0175600) Train Loss: 0.1460, Train Steps/Sec: 10.64, Grad Norm: 0.0062 +[2025-02-21 02:35:21] (step=0175700) Train Loss: 0.1490, Train Steps/Sec: 11.69, Grad Norm: 0.0065 +[2025-02-21 02:35:30] (step=0175800) Train Loss: 0.1493, Train Steps/Sec: 12.16, Grad Norm: 0.0065 +[2025-02-21 02:35:39] (step=0175900) Train Loss: 0.1480, Train Steps/Sec: 10.75, Grad Norm: 0.0063 +[2025-02-21 02:35:47] (step=0176000) Train Loss: 0.1488, Train Steps/Sec: 12.16, Grad Norm: 0.0067 +[2025-02-21 02:35:55] (step=0176100) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 02:36:03] (step=0176200) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 02:36:11] (step=0176300) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 02:36:20] (step=0176400) Train Loss: 0.1463, Train Steps/Sec: 11.22, Grad Norm: 0.0066 +[2025-02-21 02:36:28] (step=0176500) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 02:36:36] (step=0176600) Train Loss: 0.1481, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 02:36:44] (step=0176700) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 02:36:52] (step=0176800) Train Loss: 0.1475, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 02:37:00] (step=0176900) Train Loss: 0.1470, Train Steps/Sec: 12.63, Grad Norm: 0.0065 +[2025-02-21 02:37:07] (step=0177000) Train Loss: 0.1465, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 02:37:16] (step=0177100) Train Loss: 0.1485, Train Steps/Sec: 12.15, Grad Norm: 0.0063 +[2025-02-21 02:37:24] (step=0177200) Train Loss: 0.1486, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 02:37:33] (step=0177300) Train Loss: 0.1474, Train Steps/Sec: 10.90, Grad Norm: 0.0064 +[2025-02-21 02:37:42] (step=0177400) Train Loss: 0.1484, Train Steps/Sec: 10.64, Grad Norm: 0.0070 +[2025-02-21 02:37:52] (step=0177500) Train Loss: 0.1478, Train Steps/Sec: 10.64, Grad Norm: 0.0065 +[2025-02-21 02:37:59] (step=0177600) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 02:38:09] (step=0177700) Train Loss: 0.1483, Train Steps/Sec: 10.50, Grad Norm: 0.0066 +[2025-02-21 02:38:19] (step=0177800) Train Loss: 0.1484, Train Steps/Sec: 10.29, Grad Norm: 0.0065 +[2025-02-21 02:38:27] (step=0177900) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 02:38:36] (step=0178000) Train Loss: 0.1476, Train Steps/Sec: 10.73, Grad Norm: 0.0063 +[2025-02-21 02:38:44] (step=0178100) Train Loss: 0.1461, Train Steps/Sec: 11.79, Grad Norm: 0.0062 +[2025-02-21 02:38:52] (step=0178200) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 02:39:00] (step=0178300) Train Loss: 0.1477, Train Steps/Sec: 12.58, Grad Norm: 0.0065 +[2025-02-21 02:39:08] (step=0178400) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 02:39:16] (step=0178500) Train Loss: 0.1478, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 02:39:24] (step=0178600) Train Loss: 0.1478, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 02:39:32] (step=0178700) Train Loss: 0.1476, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 02:39:40] (step=0178800) Train Loss: 0.1478, Train Steps/Sec: 12.62, Grad Norm: 0.0066 +[2025-02-21 02:39:49] (step=0178900) Train Loss: 0.1485, Train Steps/Sec: 11.24, Grad Norm: 0.0068 +[2025-02-21 02:39:57] (step=0179000) Train Loss: 0.1484, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 02:40:05] (step=0179100) Train Loss: 0.1467, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 02:40:13] (step=0179200) Train Loss: 0.1483, Train Steps/Sec: 12.11, Grad Norm: 0.0065 +[2025-02-21 02:40:21] (step=0179300) Train Loss: 0.1489, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 02:40:29] (step=0179400) Train Loss: 0.1489, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 02:40:39] (step=0179500) Train Loss: 0.1468, Train Steps/Sec: 10.26, Grad Norm: 0.0064 +[2025-02-21 02:40:48] (step=0179600) Train Loss: 0.1481, Train Steps/Sec: 10.34, Grad Norm: 0.0066 +[2025-02-21 02:40:57] (step=0179700) Train Loss: 0.1494, Train Steps/Sec: 11.67, Grad Norm: 0.0067 +[2025-02-21 02:41:05] (step=0179800) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 02:41:14] (step=0179900) Train Loss: 0.1473, Train Steps/Sec: 11.28, Grad Norm: 0.0064 +[2025-02-21 02:41:23] (step=0180000) Train Loss: 0.1485, Train Steps/Sec: 10.62, Grad Norm: 0.0065 +[2025-02-21 02:41:28] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0180000.pt +[2025-02-21 02:41:37] (step=0180100) Train Loss: 0.1475, Train Steps/Sec: 7.34, Grad Norm: 0.0062 +[2025-02-21 02:41:47] (step=0180200) Train Loss: 0.1488, Train Steps/Sec: 9.91, Grad Norm: 0.0063 +[2025-02-21 02:41:55] (step=0180300) Train Loss: 0.1479, Train Steps/Sec: 12.58, Grad Norm: 0.0065 +[2025-02-21 02:42:03] (step=0180400) Train Loss: 0.1469, Train Steps/Sec: 12.58, Grad Norm: 0.0066 +[2025-02-21 02:42:11] (step=0180500) Train Loss: 0.1484, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 02:42:19] (step=0180600) Train Loss: 0.1478, Train Steps/Sec: 12.56, Grad Norm: 0.0063 +[2025-02-21 02:42:27] (step=0180700) Train Loss: 0.1466, Train Steps/Sec: 12.55, Grad Norm: 0.0066 +[2025-02-21 02:42:35] (step=0180800) Train Loss: 0.1474, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 02:42:42] (step=0180900) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 02:42:50] (step=0181000) Train Loss: 0.1482, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 02:42:58] (step=0181100) Train Loss: 0.1479, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 02:43:06] (step=0181200) Train Loss: 0.1483, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 02:43:14] (step=0181300) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 02:43:23] (step=0181400) Train Loss: 0.1482, Train Steps/Sec: 10.84, Grad Norm: 0.0067 +[2025-02-21 02:43:31] (step=0181500) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 02:43:39] (step=0181600) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 02:43:49] (step=0181700) Train Loss: 0.1464, Train Steps/Sec: 9.99, Grad Norm: 0.0062 +[2025-02-21 02:43:59] (step=0181800) Train Loss: 0.1492, Train Steps/Sec: 10.66, Grad Norm: 0.0066 +[2025-02-21 02:44:07] (step=0181900) Train Loss: 0.1458, Train Steps/Sec: 11.72, Grad Norm: 0.0063 +[2025-02-21 02:44:15] (step=0182000) Train Loss: 0.1476, Train Steps/Sec: 12.62, Grad Norm: 0.0066 +[2025-02-21 02:44:24] (step=0182100) Train Loss: 0.1474, Train Steps/Sec: 11.34, Grad Norm: 0.0064 +[2025-02-21 02:44:34] (step=0182200) Train Loss: 0.1473, Train Steps/Sec: 10.06, Grad Norm: 0.0064 +[2025-02-21 02:44:43] (step=0182300) Train Loss: 0.1474, Train Steps/Sec: 10.73, Grad Norm: 0.0064 +[2025-02-21 02:44:51] (step=0182400) Train Loss: 0.1477, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 02:44:59] (step=0182500) Train Loss: 0.1491, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 02:45:07] (step=0182600) Train Loss: 0.1479, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 02:45:16] (step=0182700) Train Loss: 0.1473, Train Steps/Sec: 11.15, Grad Norm: 0.0063 +[2025-02-21 02:45:24] (step=0182800) Train Loss: 0.1483, Train Steps/Sec: 12.55, Grad Norm: 0.0064 +[2025-02-21 02:45:32] (step=0182900) Train Loss: 0.1478, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 02:45:40] (step=0183000) Train Loss: 0.1480, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 02:45:48] (step=0183100) Train Loss: 0.1485, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 02:45:56] (step=0183200) Train Loss: 0.1476, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 02:46:04] (step=0183300) Train Loss: 0.1476, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 02:46:12] (step=0183400) Train Loss: 0.1477, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 02:46:20] (step=0183500) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 02:46:28] (step=0183600) Train Loss: 0.1472, Train Steps/Sec: 12.14, Grad Norm: 0.0063 +[2025-02-21 02:46:36] (step=0183700) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 02:46:44] (step=0183800) Train Loss: 0.1480, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 02:46:55] (step=0183900) Train Loss: 0.1477, Train Steps/Sec: 8.94, Grad Norm: 0.0063 +[2025-02-21 02:47:04] (step=0184000) Train Loss: 0.1481, Train Steps/Sec: 10.60, Grad Norm: 0.0064 +[2025-02-21 02:47:12] (step=0184100) Train Loss: 0.1477, Train Steps/Sec: 12.15, Grad Norm: 0.0063 +[2025-02-21 02:47:20] (step=0184200) Train Loss: 0.1478, Train Steps/Sec: 12.59, Grad Norm: 0.0068 +[2025-02-21 02:47:30] (step=0184300) Train Loss: 0.1469, Train Steps/Sec: 10.89, Grad Norm: 0.0064 +[2025-02-21 02:47:40] (step=0184400) Train Loss: 0.1475, Train Steps/Sec: 10.06, Grad Norm: 0.0064 +[2025-02-21 02:47:48] (step=0184500) Train Loss: 0.1492, Train Steps/Sec: 11.80, Grad Norm: 0.0066 +[2025-02-21 02:47:56] (step=0184600) Train Loss: 0.1482, Train Steps/Sec: 12.63, Grad Norm: 0.0066 +[2025-02-21 02:48:04] (step=0184700) Train Loss: 0.1482, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 02:48:12] (step=0184800) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 02:48:20] (step=0184900) Train Loss: 0.1493, Train Steps/Sec: 12.63, Grad Norm: 0.0065 +[2025-02-21 02:48:28] (step=0185000) Train Loss: 0.1484, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 02:48:36] (step=0185100) Train Loss: 0.1489, Train Steps/Sec: 12.60, Grad Norm: 0.0066 +[2025-02-21 02:48:45] (step=0185200) Train Loss: 0.1482, Train Steps/Sec: 11.15, Grad Norm: 0.0063 +[2025-02-21 02:48:53] (step=0185300) Train Loss: 0.1485, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 02:49:00] (step=0185400) Train Loss: 0.1477, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 02:49:08] (step=0185500) Train Loss: 0.1488, Train Steps/Sec: 12.63, Grad Norm: 0.0065 +[2025-02-21 02:49:16] (step=0185600) Train Loss: 0.1474, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 02:49:24] (step=0185700) Train Loss: 0.1486, Train Steps/Sec: 12.63, Grad Norm: 0.0067 +[2025-02-21 02:49:32] (step=0185800) Train Loss: 0.1482, Train Steps/Sec: 12.16, Grad Norm: 0.0065 +[2025-02-21 02:49:40] (step=0185900) Train Loss: 0.1490, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 02:49:48] (step=0186000) Train Loss: 0.1494, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 02:49:59] (step=0186100) Train Loss: 0.1485, Train Steps/Sec: 9.68, Grad Norm: 0.0067 +[2025-02-21 02:50:08] (step=0186200) Train Loss: 0.1482, Train Steps/Sec: 10.31, Grad Norm: 0.0063 +[2025-02-21 02:50:17] (step=0186300) Train Loss: 0.1480, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 02:50:26] (step=0186400) Train Loss: 0.1474, Train Steps/Sec: 11.21, Grad Norm: 0.0063 +[2025-02-21 02:50:36] (step=0186500) Train Loss: 0.1489, Train Steps/Sec: 9.23, Grad Norm: 0.0064 +[2025-02-21 02:50:46] (step=0186600) Train Loss: 0.1484, Train Steps/Sec: 10.60, Grad Norm: 0.0064 +[2025-02-21 02:50:54] (step=0186700) Train Loss: 0.1483, Train Steps/Sec: 12.61, Grad Norm: 0.0068 +[2025-02-21 02:51:02] (step=0186800) Train Loss: 0.1486, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 02:51:10] (step=0186900) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 02:51:18] (step=0187000) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 02:51:25] (step=0187100) Train Loss: 0.1485, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 02:51:33] (step=0187200) Train Loss: 0.1492, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 02:51:41] (step=0187300) Train Loss: 0.1481, Train Steps/Sec: 12.60, Grad Norm: 0.0066 +[2025-02-21 02:51:49] (step=0187400) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 02:51:57] (step=0187500) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 02:52:05] (step=0187600) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 02:52:14] (step=0187700) Train Loss: 0.1477, Train Steps/Sec: 11.20, Grad Norm: 0.0065 +[2025-02-21 02:52:22] (step=0187800) Train Loss: 0.1476, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 02:52:30] (step=0187900) Train Loss: 0.1488, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 02:52:38] (step=0188000) Train Loss: 0.1487, Train Steps/Sec: 12.06, Grad Norm: 0.0064 +[2025-02-21 02:52:46] (step=0188100) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 02:52:54] (step=0188200) Train Loss: 0.1484, Train Steps/Sec: 12.09, Grad Norm: 0.0062 +[2025-02-21 02:53:05] (step=0188300) Train Loss: 0.1467, Train Steps/Sec: 9.44, Grad Norm: 0.0064 +[2025-02-21 02:53:14] (step=0188400) Train Loss: 0.1491, Train Steps/Sec: 10.61, Grad Norm: 0.0066 +[2025-02-21 02:53:22] (step=0188500) Train Loss: 0.1487, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 02:53:31] (step=0188600) Train Loss: 0.1472, Train Steps/Sec: 11.74, Grad Norm: 0.0064 +[2025-02-21 02:53:41] (step=0188700) Train Loss: 0.1489, Train Steps/Sec: 9.99, Grad Norm: 0.0063 +[2025-02-21 02:53:50] (step=0188800) Train Loss: 0.1478, Train Steps/Sec: 10.65, Grad Norm: 0.0064 +[2025-02-21 02:53:58] (step=0188900) Train Loss: 0.1473, Train Steps/Sec: 12.55, Grad Norm: 0.0065 +[2025-02-21 02:54:07] (step=0189000) Train Loss: 0.1478, Train Steps/Sec: 11.27, Grad Norm: 0.0067 +[2025-02-21 02:54:15] (step=0189100) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 02:54:23] (step=0189200) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 02:54:31] (step=0189300) Train Loss: 0.1487, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 02:54:39] (step=0189400) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 02:54:47] (step=0189500) Train Loss: 0.1486, Train Steps/Sec: 12.63, Grad Norm: 0.0066 +[2025-02-21 02:54:55] (step=0189600) Train Loss: 0.1484, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 02:55:03] (step=0189700) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 02:55:11] (step=0189800) Train Loss: 0.1472, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 02:55:18] (step=0189900) Train Loss: 0.1487, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 02:55:26] (step=0190000) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 02:55:34] (step=0190100) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 02:55:44] (step=0190200) Train Loss: 0.1471, Train Steps/Sec: 10.86, Grad Norm: 0.0063 +[2025-02-21 02:55:52] (step=0190300) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0066 +[2025-02-21 02:56:00] (step=0190400) Train Loss: 0.1472, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-21 02:56:11] (step=0190500) Train Loss: 0.1469, Train Steps/Sec: 9.45, Grad Norm: 0.0064 +[2025-02-21 02:56:20] (step=0190600) Train Loss: 0.1478, Train Steps/Sec: 10.95, Grad Norm: 0.0064 +[2025-02-21 02:56:28] (step=0190700) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 02:56:37] (step=0190800) Train Loss: 0.1493, Train Steps/Sec: 10.75, Grad Norm: 0.0063 +[2025-02-21 02:56:47] (step=0190900) Train Loss: 0.1488, Train Steps/Sec: 10.32, Grad Norm: 0.0063 +[2025-02-21 02:56:56] (step=0191000) Train Loss: 0.1469, Train Steps/Sec: 11.31, Grad Norm: 0.0067 +[2025-02-21 02:57:04] (step=0191100) Train Loss: 0.1488, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 02:57:11] (step=0191200) Train Loss: 0.1476, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 02:57:19] (step=0191300) Train Loss: 0.1488, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 02:57:27] (step=0191400) Train Loss: 0.1489, Train Steps/Sec: 12.55, Grad Norm: 0.0066 +[2025-02-21 02:57:36] (step=0191500) Train Loss: 0.1489, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-21 02:57:44] (step=0191600) Train Loss: 0.1478, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 02:57:52] (step=0191700) Train Loss: 0.1470, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 02:58:00] (step=0191800) Train Loss: 0.1480, Train Steps/Sec: 12.62, Grad Norm: 0.0066 +[2025-02-21 02:58:08] (step=0191900) Train Loss: 0.1495, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 02:58:16] (step=0192000) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 02:58:24] (step=0192100) Train Loss: 0.1472, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 02:58:32] (step=0192200) Train Loss: 0.1466, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 02:58:40] (step=0192300) Train Loss: 0.1467, Train Steps/Sec: 12.12, Grad Norm: 0.0062 +[2025-02-21 02:58:48] (step=0192400) Train Loss: 0.1471, Train Steps/Sec: 12.58, Grad Norm: 0.0066 +[2025-02-21 02:58:56] (step=0192500) Train Loss: 0.1475, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 02:59:05] (step=0192600) Train Loss: 0.1477, Train Steps/Sec: 10.58, Grad Norm: 0.0063 +[2025-02-21 02:59:16] (step=0192700) Train Loss: 0.1470, Train Steps/Sec: 9.46, Grad Norm: 0.0062 +[2025-02-21 02:59:25] (step=0192800) Train Loss: 0.1460, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-21 02:59:34] (step=0192900) Train Loss: 0.1480, Train Steps/Sec: 11.05, Grad Norm: 0.0064 +[2025-02-21 02:59:43] (step=0193000) Train Loss: 0.1473, Train Steps/Sec: 11.45, Grad Norm: 0.0065 +[2025-02-21 02:59:52] (step=0193100) Train Loss: 0.1487, Train Steps/Sec: 10.65, Grad Norm: 0.0067 +[2025-02-21 03:00:01] (step=0193200) Train Loss: 0.1468, Train Steps/Sec: 11.31, Grad Norm: 0.0066 +[2025-02-21 03:00:09] (step=0193300) Train Loss: 0.1484, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 03:00:17] (step=0193400) Train Loss: 0.1471, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 03:00:25] (step=0193500) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0069 +[2025-02-21 03:00:33] (step=0193600) Train Loss: 0.1482, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 03:00:41] (step=0193700) Train Loss: 0.1494, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 03:00:49] (step=0193800) Train Loss: 0.1479, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 03:00:57] (step=0193900) Train Loss: 0.1476, Train Steps/Sec: 12.54, Grad Norm: 0.0064 +[2025-02-21 03:01:06] (step=0194000) Train Loss: 0.1484, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-21 03:01:14] (step=0194100) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 03:01:21] (step=0194200) Train Loss: 0.1485, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 03:01:29] (step=0194300) Train Loss: 0.1477, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 03:01:37] (step=0194400) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 03:01:46] (step=0194500) Train Loss: 0.1481, Train Steps/Sec: 12.14, Grad Norm: 0.0064 +[2025-02-21 03:01:53] (step=0194600) Train Loss: 0.1481, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 03:02:01] (step=0194700) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:02:11] (step=0194800) Train Loss: 0.1486, Train Steps/Sec: 10.28, Grad Norm: 0.0064 +[2025-02-21 03:02:21] (step=0194900) Train Loss: 0.1480, Train Steps/Sec: 10.66, Grad Norm: 0.0063 +[2025-02-21 03:02:30] (step=0195000) Train Loss: 0.1484, Train Steps/Sec: 10.61, Grad Norm: 0.0065 +[2025-02-21 03:02:40] (step=0195100) Train Loss: 0.1472, Train Steps/Sec: 10.38, Grad Norm: 0.0065 +[2025-02-21 03:02:49] (step=0195200) Train Loss: 0.1463, Train Steps/Sec: 10.53, Grad Norm: 0.0063 +[2025-02-21 03:02:59] (step=0195300) Train Loss: 0.1479, Train Steps/Sec: 10.63, Grad Norm: 0.0064 +[2025-02-21 03:03:07] (step=0195400) Train Loss: 0.1474, Train Steps/Sec: 11.69, Grad Norm: 0.0062 +[2025-02-21 03:03:15] (step=0195500) Train Loss: 0.1482, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 03:03:23] (step=0195600) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:03:31] (step=0195700) Train Loss: 0.1486, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 03:03:39] (step=0195800) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 03:03:47] (step=0195900) Train Loss: 0.1487, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:03:55] (step=0196000) Train Loss: 0.1480, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 03:04:03] (step=0196100) Train Loss: 0.1478, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 03:04:11] (step=0196200) Train Loss: 0.1483, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 03:04:18] (step=0196300) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 03:04:26] (step=0196400) Train Loss: 0.1468, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 03:04:35] (step=0196500) Train Loss: 0.1492, Train Steps/Sec: 11.18, Grad Norm: 0.0066 +[2025-02-21 03:04:43] (step=0196600) Train Loss: 0.1463, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 03:04:52] (step=0196700) Train Loss: 0.1486, Train Steps/Sec: 12.10, Grad Norm: 0.0064 +[2025-02-21 03:05:00] (step=0196800) Train Loss: 0.1478, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 03:05:07] (step=0196900) Train Loss: 0.1474, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 03:05:18] (step=0197000) Train Loss: 0.1487, Train Steps/Sec: 9.95, Grad Norm: 0.0064 +[2025-02-21 03:05:27] (step=0197100) Train Loss: 0.1478, Train Steps/Sec: 10.32, Grad Norm: 0.0064 +[2025-02-21 03:05:37] (step=0197200) Train Loss: 0.1479, Train Steps/Sec: 10.15, Grad Norm: 0.0064 +[2025-02-21 03:05:46] (step=0197300) Train Loss: 0.1475, Train Steps/Sec: 11.42, Grad Norm: 0.0065 +[2025-02-21 03:05:54] (step=0197400) Train Loss: 0.1488, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 03:06:03] (step=0197500) Train Loss: 0.1491, Train Steps/Sec: 10.31, Grad Norm: 0.0064 +[2025-02-21 03:06:12] (step=0197600) Train Loss: 0.1477, Train Steps/Sec: 11.71, Grad Norm: 0.0065 +[2025-02-21 03:06:21] (step=0197700) Train Loss: 0.1478, Train Steps/Sec: 11.19, Grad Norm: 0.0064 +[2025-02-21 03:06:29] (step=0197800) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 03:06:37] (step=0197900) Train Loss: 0.1482, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 03:06:45] (step=0198000) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0067 +[2025-02-21 03:06:53] (step=0198100) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 03:07:01] (step=0198200) Train Loss: 0.1489, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 03:07:09] (step=0198300) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 03:07:17] (step=0198400) Train Loss: 0.1475, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 03:07:24] (step=0198500) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 03:07:32] (step=0198600) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 03:07:40] (step=0198700) Train Loss: 0.1478, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 03:07:48] (step=0198800) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 03:07:57] (step=0198900) Train Loss: 0.1496, Train Steps/Sec: 12.08, Grad Norm: 0.0065 +[2025-02-21 03:08:05] (step=0199000) Train Loss: 0.1475, Train Steps/Sec: 11.24, Grad Norm: 0.0064 +[2025-02-21 03:08:13] (step=0199100) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 03:08:24] (step=0199200) Train Loss: 0.1480, Train Steps/Sec: 9.69, Grad Norm: 0.0064 +[2025-02-21 03:08:34] (step=0199300) Train Loss: 0.1475, Train Steps/Sec: 10.04, Grad Norm: 0.0066 +[2025-02-21 03:08:43] (step=0199400) Train Loss: 0.1469, Train Steps/Sec: 10.40, Grad Norm: 0.0065 +[2025-02-21 03:08:51] (step=0199500) Train Loss: 0.1479, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 03:08:59] (step=0199600) Train Loss: 0.1487, Train Steps/Sec: 12.12, Grad Norm: 0.0063 +[2025-02-21 03:09:09] (step=0199700) Train Loss: 0.1467, Train Steps/Sec: 10.29, Grad Norm: 0.0063 +[2025-02-21 03:09:17] (step=0199800) Train Loss: 0.1482, Train Steps/Sec: 12.10, Grad Norm: 0.0063 +[2025-02-21 03:09:25] (step=0199900) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 03:09:33] (step=0200000) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:09:37] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0200000.pt +[2025-02-21 03:09:45] (step=0200100) Train Loss: 0.1473, Train Steps/Sec: 8.39, Grad Norm: 0.0064 +[2025-02-21 03:09:54] (step=0200200) Train Loss: 0.1488, Train Steps/Sec: 11.18, Grad Norm: 0.0064 +[2025-02-21 03:10:02] (step=0200300) Train Loss: 0.1483, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 03:10:10] (step=0200400) Train Loss: 0.1477, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 03:10:18] (step=0200500) Train Loss: 0.1477, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 03:10:26] (step=0200600) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0068 +[2025-02-21 03:10:34] (step=0200700) Train Loss: 0.1477, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 03:10:42] (step=0200800) Train Loss: 0.1477, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 03:10:50] (step=0200900) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 03:10:58] (step=0201000) Train Loss: 0.1479, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 03:11:06] (step=0201100) Train Loss: 0.1475, Train Steps/Sec: 12.12, Grad Norm: 0.0062 +[2025-02-21 03:11:14] (step=0201200) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 03:11:22] (step=0201300) Train Loss: 0.1472, Train Steps/Sec: 12.11, Grad Norm: 0.0063 +[2025-02-21 03:11:32] (step=0201400) Train Loss: 0.1476, Train Steps/Sec: 9.70, Grad Norm: 0.0063 +[2025-02-21 03:11:44] (step=0201500) Train Loss: 0.1482, Train Steps/Sec: 8.28, Grad Norm: 0.0064 +[2025-02-21 03:11:53] (step=0201600) Train Loss: 0.1474, Train Steps/Sec: 11.76, Grad Norm: 0.0063 +[2025-02-21 03:12:01] (step=0201700) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 03:12:10] (step=0201800) Train Loss: 0.1466, Train Steps/Sec: 11.68, Grad Norm: 0.0065 +[2025-02-21 03:12:19] (step=0201900) Train Loss: 0.1470, Train Steps/Sec: 10.63, Grad Norm: 0.0063 +[2025-02-21 03:12:27] (step=0202000) Train Loss: 0.1471, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 03:12:35] (step=0202100) Train Loss: 0.1490, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 03:12:43] (step=0202200) Train Loss: 0.1481, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 03:12:51] (step=0202300) Train Loss: 0.1471, Train Steps/Sec: 12.56, Grad Norm: 0.0065 +[2025-02-21 03:12:59] (step=0202400) Train Loss: 0.1484, Train Steps/Sec: 12.57, Grad Norm: 0.0065 +[2025-02-21 03:13:07] (step=0202500) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 03:13:15] (step=0202600) Train Loss: 0.1491, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 03:13:24] (step=0202700) Train Loss: 0.1473, Train Steps/Sec: 11.20, Grad Norm: 0.0064 +[2025-02-21 03:13:32] (step=0202800) Train Loss: 0.1485, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 03:13:40] (step=0202900) Train Loss: 0.1483, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 03:13:48] (step=0203000) Train Loss: 0.1488, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 03:13:56] (step=0203100) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 03:14:03] (step=0203200) Train Loss: 0.1478, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 03:14:12] (step=0203300) Train Loss: 0.1486, Train Steps/Sec: 12.13, Grad Norm: 0.0064 +[2025-02-21 03:14:20] (step=0203400) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 03:14:28] (step=0203500) Train Loss: 0.1479, Train Steps/Sec: 11.32, Grad Norm: 0.0064 +[2025-02-21 03:14:39] (step=0203600) Train Loss: 0.1469, Train Steps/Sec: 9.25, Grad Norm: 0.0063 +[2025-02-21 03:14:50] (step=0203700) Train Loss: 0.1476, Train Steps/Sec: 9.73, Grad Norm: 0.0062 +[2025-02-21 03:14:58] (step=0203800) Train Loss: 0.1481, Train Steps/Sec: 12.14, Grad Norm: 0.0068 +[2025-02-21 03:15:06] (step=0203900) Train Loss: 0.1461, Train Steps/Sec: 12.54, Grad Norm: 0.0064 +[2025-02-21 03:15:16] (step=0204000) Train Loss: 0.1468, Train Steps/Sec: 10.24, Grad Norm: 0.0063 +[2025-02-21 03:15:25] (step=0204100) Train Loss: 0.1470, Train Steps/Sec: 10.62, Grad Norm: 0.0063 +[2025-02-21 03:15:33] (step=0204200) Train Loss: 0.1472, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 03:15:41] (step=0204300) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 03:15:49] (step=0204400) Train Loss: 0.1467, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 03:15:57] (step=0204500) Train Loss: 0.1472, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 03:16:05] (step=0204600) Train Loss: 0.1481, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 03:16:13] (step=0204700) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 03:16:20] (step=0204800) Train Loss: 0.1476, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:16:28] (step=0204900) Train Loss: 0.1479, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 03:16:36] (step=0205000) Train Loss: 0.1470, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 03:16:44] (step=0205100) Train Loss: 0.1481, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 03:16:53] (step=0205200) Train Loss: 0.1484, Train Steps/Sec: 11.23, Grad Norm: 0.0064 +[2025-02-21 03:17:01] (step=0205300) Train Loss: 0.1481, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 03:17:09] (step=0205400) Train Loss: 0.1475, Train Steps/Sec: 12.14, Grad Norm: 0.0064 +[2025-02-21 03:17:17] (step=0205500) Train Loss: 0.1478, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 03:17:25] (step=0205600) Train Loss: 0.1494, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 03:17:35] (step=0205700) Train Loss: 0.1468, Train Steps/Sec: 10.34, Grad Norm: 0.0064 +[2025-02-21 03:17:46] (step=0205800) Train Loss: 0.1471, Train Steps/Sec: 9.29, Grad Norm: 0.0062 +[2025-02-21 03:17:55] (step=0205900) Train Loss: 0.1490, Train Steps/Sec: 10.63, Grad Norm: 0.0065 +[2025-02-21 03:18:03] (step=0206000) Train Loss: 0.1483, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 03:18:11] (step=0206100) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 03:18:20] (step=0206200) Train Loss: 0.1474, Train Steps/Sec: 11.34, Grad Norm: 0.0063 +[2025-02-21 03:18:29] (step=0206300) Train Loss: 0.1477, Train Steps/Sec: 10.63, Grad Norm: 0.0066 +[2025-02-21 03:18:37] (step=0206400) Train Loss: 0.1468, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 03:18:46] (step=0206500) Train Loss: 0.1465, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-21 03:18:54] (step=0206600) Train Loss: 0.1474, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 03:19:02] (step=0206700) Train Loss: 0.1480, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 03:19:10] (step=0206800) Train Loss: 0.1481, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 03:19:18] (step=0206900) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:19:26] (step=0207000) Train Loss: 0.1471, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 03:19:34] (step=0207100) Train Loss: 0.1471, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 03:19:41] (step=0207200) Train Loss: 0.1475, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 03:19:49] (step=0207300) Train Loss: 0.1478, Train Steps/Sec: 12.64, Grad Norm: 0.0064 +[2025-02-21 03:19:57] (step=0207400) Train Loss: 0.1476, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 03:20:05] (step=0207500) Train Loss: 0.1479, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 03:20:13] (step=0207600) Train Loss: 0.1457, Train Steps/Sec: 12.16, Grad Norm: 0.0063 +[2025-02-21 03:20:22] (step=0207700) Train Loss: 0.1473, Train Steps/Sec: 11.21, Grad Norm: 0.0063 +[2025-02-21 03:20:31] (step=0207800) Train Loss: 0.1485, Train Steps/Sec: 12.14, Grad Norm: 0.0066 +[2025-02-21 03:20:42] (step=0207900) Train Loss: 0.1471, Train Steps/Sec: 9.17, Grad Norm: 0.0064 +[2025-02-21 03:20:51] (step=0208000) Train Loss: 0.1463, Train Steps/Sec: 10.05, Grad Norm: 0.0063 +[2025-02-21 03:21:00] (step=0208100) Train Loss: 0.1490, Train Steps/Sec: 11.36, Grad Norm: 0.0066 +[2025-02-21 03:21:08] (step=0208200) Train Loss: 0.1489, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 03:21:16] (step=0208300) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 03:21:25] (step=0208400) Train Loss: 0.1489, Train Steps/Sec: 11.71, Grad Norm: 0.0062 +[2025-02-21 03:21:34] (step=0208500) Train Loss: 0.1488, Train Steps/Sec: 10.62, Grad Norm: 0.0063 +[2025-02-21 03:21:42] (step=0208600) Train Loss: 0.1479, Train Steps/Sec: 12.59, Grad Norm: 0.0066 +[2025-02-21 03:21:50] (step=0208700) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0066 +[2025-02-21 03:21:58] (step=0208800) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 03:22:06] (step=0208900) Train Loss: 0.1475, Train Steps/Sec: 12.56, Grad Norm: 0.0063 +[2025-02-21 03:22:15] (step=0209000) Train Loss: 0.1480, Train Steps/Sec: 11.26, Grad Norm: 0.0064 +[2025-02-21 03:22:23] (step=0209100) Train Loss: 0.1476, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:22:31] (step=0209200) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:22:39] (step=0209300) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 03:22:47] (step=0209400) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:22:54] (step=0209500) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 03:23:02] (step=0209600) Train Loss: 0.1488, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:23:10] (step=0209700) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0066 +[2025-02-21 03:23:19] (step=0209800) Train Loss: 0.1481, Train Steps/Sec: 12.13, Grad Norm: 0.0065 +[2025-02-21 03:23:27] (step=0209900) Train Loss: 0.1471, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 03:23:36] (step=0210000) Train Loss: 0.1476, Train Steps/Sec: 10.74, Grad Norm: 0.0062 +[2025-02-21 03:23:46] (step=0210100) Train Loss: 0.1465, Train Steps/Sec: 9.43, Grad Norm: 0.0065 +[2025-02-21 03:23:57] (step=0210200) Train Loss: 0.1483, Train Steps/Sec: 9.35, Grad Norm: 0.0061 +[2025-02-21 03:24:06] (step=0210300) Train Loss: 0.1485, Train Steps/Sec: 11.72, Grad Norm: 0.0063 +[2025-02-21 03:24:14] (step=0210400) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 03:24:22] (step=0210500) Train Loss: 0.1477, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 03:24:30] (step=0210600) Train Loss: 0.1467, Train Steps/Sec: 11.71, Grad Norm: 0.0063 +[2025-02-21 03:24:39] (step=0210700) Train Loss: 0.1481, Train Steps/Sec: 10.64, Grad Norm: 0.0064 +[2025-02-21 03:24:47] (step=0210800) Train Loss: 0.1478, Train Steps/Sec: 12.62, Grad Norm: 0.0066 +[2025-02-21 03:24:55] (step=0210900) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0067 +[2025-02-21 03:25:03] (step=0211000) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 03:25:11] (step=0211100) Train Loss: 0.1484, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 03:25:19] (step=0211200) Train Loss: 0.1480, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:25:27] (step=0211300) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 03:25:35] (step=0211400) Train Loss: 0.1480, Train Steps/Sec: 12.55, Grad Norm: 0.0063 +[2025-02-21 03:25:44] (step=0211500) Train Loss: 0.1474, Train Steps/Sec: 11.14, Grad Norm: 0.0061 +[2025-02-21 03:25:52] (step=0211600) Train Loss: 0.1482, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 03:26:00] (step=0211700) Train Loss: 0.1476, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 03:26:08] (step=0211800) Train Loss: 0.1470, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 03:26:16] (step=0211900) Train Loss: 0.1471, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 03:26:24] (step=0212000) Train Loss: 0.1461, Train Steps/Sec: 12.15, Grad Norm: 0.0063 +[2025-02-21 03:26:32] (step=0212100) Train Loss: 0.1487, Train Steps/Sec: 11.83, Grad Norm: 0.0064 +[2025-02-21 03:26:42] (step=0212200) Train Loss: 0.1481, Train Steps/Sec: 10.45, Grad Norm: 0.0063 +[2025-02-21 03:26:52] (step=0212300) Train Loss: 0.1475, Train Steps/Sec: 10.27, Grad Norm: 0.0063 +[2025-02-21 03:27:02] (step=0212400) Train Loss: 0.1484, Train Steps/Sec: 10.24, Grad Norm: 0.0062 +[2025-02-21 03:27:10] (step=0212500) Train Loss: 0.1476, Train Steps/Sec: 11.75, Grad Norm: 0.0063 +[2025-02-21 03:27:18] (step=0212600) Train Loss: 0.1487, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 03:27:27] (step=0212700) Train Loss: 0.1464, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-21 03:27:35] (step=0212800) Train Loss: 0.1482, Train Steps/Sec: 11.71, Grad Norm: 0.0063 +[2025-02-21 03:27:45] (step=0212900) Train Loss: 0.1476, Train Steps/Sec: 10.62, Grad Norm: 0.0064 +[2025-02-21 03:27:53] (step=0213000) Train Loss: 0.1477, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 03:28:01] (step=0213100) Train Loss: 0.1470, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 03:28:09] (step=0213200) Train Loss: 0.1487, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 03:28:17] (step=0213300) Train Loss: 0.1482, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 03:28:24] (step=0213400) Train Loss: 0.1478, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 03:28:32] (step=0213500) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 03:28:40] (step=0213600) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:28:48] (step=0213700) Train Loss: 0.1482, Train Steps/Sec: 12.58, Grad Norm: 0.0066 +[2025-02-21 03:28:56] (step=0213800) Train Loss: 0.1470, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 03:29:04] (step=0213900) Train Loss: 0.1474, Train Steps/Sec: 12.54, Grad Norm: 0.0064 +[2025-02-21 03:29:13] (step=0214000) Train Loss: 0.1485, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-21 03:29:21] (step=0214100) Train Loss: 0.1477, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 03:29:29] (step=0214200) Train Loss: 0.1472, Train Steps/Sec: 12.13, Grad Norm: 0.0065 +[2025-02-21 03:29:39] (step=0214300) Train Loss: 0.1476, Train Steps/Sec: 10.37, Grad Norm: 0.0064 +[2025-02-21 03:29:48] (step=0214400) Train Loss: 0.1468, Train Steps/Sec: 11.04, Grad Norm: 0.0062 +[2025-02-21 03:29:57] (step=0214500) Train Loss: 0.1478, Train Steps/Sec: 10.55, Grad Norm: 0.0065 +[2025-02-21 03:30:07] (step=0214600) Train Loss: 0.1470, Train Steps/Sec: 10.34, Grad Norm: 0.0065 +[2025-02-21 03:30:15] (step=0214700) Train Loss: 0.1475, Train Steps/Sec: 12.18, Grad Norm: 0.0063 +[2025-02-21 03:30:23] (step=0214800) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 03:30:31] (step=0214900) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 03:30:40] (step=0215000) Train Loss: 0.1476, Train Steps/Sec: 11.68, Grad Norm: 0.0061 +[2025-02-21 03:30:49] (step=0215100) Train Loss: 0.1456, Train Steps/Sec: 10.63, Grad Norm: 0.0065 +[2025-02-21 03:30:58] (step=0215200) Train Loss: 0.1468, Train Steps/Sec: 11.25, Grad Norm: 0.0062 +[2025-02-21 03:31:06] (step=0215300) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 03:31:14] (step=0215400) Train Loss: 0.1475, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 03:31:22] (step=0215500) Train Loss: 0.1487, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 03:31:30] (step=0215600) Train Loss: 0.1483, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 03:31:38] (step=0215700) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 03:31:46] (step=0215800) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 03:31:54] (step=0215900) Train Loss: 0.1487, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 03:32:02] (step=0216000) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 03:32:10] (step=0216100) Train Loss: 0.1479, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 03:32:17] (step=0216200) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 03:32:25] (step=0216300) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 03:32:35] (step=0216400) Train Loss: 0.1478, Train Steps/Sec: 10.69, Grad Norm: 0.0066 +[2025-02-21 03:32:45] (step=0216500) Train Loss: 0.1481, Train Steps/Sec: 9.95, Grad Norm: 0.0062 +[2025-02-21 03:32:54] (step=0216600) Train Loss: 0.1489, Train Steps/Sec: 11.27, Grad Norm: 0.0064 +[2025-02-21 03:33:03] (step=0216700) Train Loss: 0.1483, Train Steps/Sec: 10.24, Grad Norm: 0.0064 +[2025-02-21 03:33:13] (step=0216800) Train Loss: 0.1490, Train Steps/Sec: 10.29, Grad Norm: 0.0065 +[2025-02-21 03:33:21] (step=0216900) Train Loss: 0.1481, Train Steps/Sec: 12.57, Grad Norm: 0.0064 +[2025-02-21 03:33:29] (step=0217000) Train Loss: 0.1476, Train Steps/Sec: 12.55, Grad Norm: 0.0062 +[2025-02-21 03:33:37] (step=0217100) Train Loss: 0.1467, Train Steps/Sec: 12.56, Grad Norm: 0.0065 +[2025-02-21 03:33:46] (step=0217200) Train Loss: 0.1480, Train Steps/Sec: 11.68, Grad Norm: 0.0063 +[2025-02-21 03:33:55] (step=0217300) Train Loss: 0.1472, Train Steps/Sec: 10.63, Grad Norm: 0.0060 +[2025-02-21 03:34:03] (step=0217400) Train Loss: 0.1468, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 03:34:11] (step=0217500) Train Loss: 0.1456, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 03:34:19] (step=0217600) Train Loss: 0.1484, Train Steps/Sec: 12.56, Grad Norm: 0.0064 +[2025-02-21 03:34:28] (step=0217700) Train Loss: 0.1486, Train Steps/Sec: 11.17, Grad Norm: 0.0065 +[2025-02-21 03:34:36] (step=0217800) Train Loss: 0.1478, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 03:34:44] (step=0217900) Train Loss: 0.1468, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 03:34:52] (step=0218000) Train Loss: 0.1480, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 03:35:00] (step=0218100) Train Loss: 0.1479, Train Steps/Sec: 12.56, Grad Norm: 0.0064 +[2025-02-21 03:35:08] (step=0218200) Train Loss: 0.1465, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 03:35:16] (step=0218300) Train Loss: 0.1493, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 03:35:24] (step=0218400) Train Loss: 0.1480, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 03:35:32] (step=0218500) Train Loss: 0.1484, Train Steps/Sec: 11.73, Grad Norm: 0.0062 +[2025-02-21 03:35:41] (step=0218600) Train Loss: 0.1472, Train Steps/Sec: 10.73, Grad Norm: 0.0065 +[2025-02-21 03:35:50] (step=0218700) Train Loss: 0.1494, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 03:35:59] (step=0218800) Train Loss: 0.1482, Train Steps/Sec: 11.33, Grad Norm: 0.0062 +[2025-02-21 03:36:08] (step=0218900) Train Loss: 0.1479, Train Steps/Sec: 10.52, Grad Norm: 0.0063 +[2025-02-21 03:36:19] (step=0219000) Train Loss: 0.1476, Train Steps/Sec: 9.33, Grad Norm: 0.0065 +[2025-02-21 03:36:27] (step=0219100) Train Loss: 0.1476, Train Steps/Sec: 12.54, Grad Norm: 0.0062 +[2025-02-21 03:36:35] (step=0219200) Train Loss: 0.1484, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 03:36:43] (step=0219300) Train Loss: 0.1465, Train Steps/Sec: 12.55, Grad Norm: 0.0062 +[2025-02-21 03:36:52] (step=0219400) Train Loss: 0.1465, Train Steps/Sec: 10.96, Grad Norm: 0.0063 +[2025-02-21 03:37:01] (step=0219500) Train Loss: 0.1456, Train Steps/Sec: 10.96, Grad Norm: 0.0061 +[2025-02-21 03:37:09] (step=0219600) Train Loss: 0.1481, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:37:17] (step=0219700) Train Loss: 0.1477, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 03:37:25] (step=0219800) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 03:37:33] (step=0219900) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 03:37:41] (step=0220000) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 03:37:45] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0220000.pt +[2025-02-21 03:37:53] (step=0220100) Train Loss: 0.1475, Train Steps/Sec: 7.99, Grad Norm: 0.0063 +[2025-02-21 03:38:02] (step=0220200) Train Loss: 0.1447, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-21 03:38:10] (step=0220300) Train Loss: 0.1477, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 03:38:18] (step=0220400) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 03:38:26] (step=0220500) Train Loss: 0.1482, Train Steps/Sec: 12.57, Grad Norm: 0.0066 +[2025-02-21 03:38:34] (step=0220600) Train Loss: 0.1472, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 03:38:44] (step=0220700) Train Loss: 0.1483, Train Steps/Sec: 10.68, Grad Norm: 0.0072 +[2025-02-21 03:38:53] (step=0220800) Train Loss: 0.1477, Train Steps/Sec: 11.30, Grad Norm: 0.0063 +[2025-02-21 03:39:00] (step=0220900) Train Loss: 0.1490, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 03:39:10] (step=0221000) Train Loss: 0.1488, Train Steps/Sec: 10.98, Grad Norm: 0.0063 +[2025-02-21 03:39:19] (step=0221100) Train Loss: 0.1469, Train Steps/Sec: 10.61, Grad Norm: 0.0064 +[2025-02-21 03:39:28] (step=0221200) Train Loss: 0.1463, Train Steps/Sec: 10.65, Grad Norm: 0.0060 +[2025-02-21 03:39:36] (step=0221300) Train Loss: 0.1482, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 03:39:44] (step=0221400) Train Loss: 0.1483, Train Steps/Sec: 12.52, Grad Norm: 0.0064 +[2025-02-21 03:39:53] (step=0221500) Train Loss: 0.1484, Train Steps/Sec: 11.28, Grad Norm: 0.0063 +[2025-02-21 03:40:03] (step=0221600) Train Loss: 0.1481, Train Steps/Sec: 10.60, Grad Norm: 0.0067 +[2025-02-21 03:40:11] (step=0221700) Train Loss: 0.1473, Train Steps/Sec: 11.32, Grad Norm: 0.0063 +[2025-02-21 03:40:19] (step=0221800) Train Loss: 0.1486, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 03:40:27] (step=0221900) Train Loss: 0.1480, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 03:40:35] (step=0222000) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 03:40:43] (step=0222100) Train Loss: 0.1478, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 03:40:51] (step=0222200) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 03:40:59] (step=0222300) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 03:41:07] (step=0222400) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 03:41:15] (step=0222500) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 03:41:23] (step=0222600) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 03:41:32] (step=0222700) Train Loss: 0.1475, Train Steps/Sec: 11.20, Grad Norm: 0.0062 +[2025-02-21 03:41:41] (step=0222800) Train Loss: 0.1464, Train Steps/Sec: 11.37, Grad Norm: 0.0062 +[2025-02-21 03:41:50] (step=0222900) Train Loss: 0.1465, Train Steps/Sec: 10.29, Grad Norm: 0.0062 +[2025-02-21 03:41:58] (step=0223000) Train Loss: 0.1478, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:42:06] (step=0223100) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:42:16] (step=0223200) Train Loss: 0.1471, Train Steps/Sec: 10.27, Grad Norm: 0.0063 +[2025-02-21 03:42:25] (step=0223300) Train Loss: 0.1486, Train Steps/Sec: 11.30, Grad Norm: 0.0061 +[2025-02-21 03:42:34] (step=0223400) Train Loss: 0.1461, Train Steps/Sec: 10.63, Grad Norm: 0.0066 +[2025-02-21 03:42:42] (step=0223500) Train Loss: 0.1467, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 03:42:50] (step=0223600) Train Loss: 0.1482, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 03:42:58] (step=0223700) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 03:43:07] (step=0223800) Train Loss: 0.1462, Train Steps/Sec: 10.60, Grad Norm: 0.0062 +[2025-02-21 03:43:16] (step=0223900) Train Loss: 0.1475, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-21 03:43:25] (step=0224000) Train Loss: 0.1480, Train Steps/Sec: 11.24, Grad Norm: 0.0063 +[2025-02-21 03:43:33] (step=0224100) Train Loss: 0.1482, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 03:43:41] (step=0224200) Train Loss: 0.1483, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 03:43:49] (step=0224300) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 03:43:57] (step=0224400) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 03:44:05] (step=0224500) Train Loss: 0.1487, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 03:44:13] (step=0224600) Train Loss: 0.1490, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 03:44:21] (step=0224700) Train Loss: 0.1483, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 03:44:29] (step=0224800) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 03:44:37] (step=0224900) Train Loss: 0.1476, Train Steps/Sec: 12.18, Grad Norm: 0.0062 +[2025-02-21 03:44:46] (step=0225000) Train Loss: 0.1473, Train Steps/Sec: 10.71, Grad Norm: 0.0065 +[2025-02-21 03:44:55] (step=0225100) Train Loss: 0.1476, Train Steps/Sec: 11.31, Grad Norm: 0.0060 +[2025-02-21 03:45:04] (step=0225200) Train Loss: 0.1477, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-21 03:45:12] (step=0225300) Train Loss: 0.1471, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 03:45:21] (step=0225400) Train Loss: 0.1483, Train Steps/Sec: 10.56, Grad Norm: 0.0064 +[2025-02-21 03:45:31] (step=0225500) Train Loss: 0.1474, Train Steps/Sec: 10.62, Grad Norm: 0.0064 +[2025-02-21 03:45:40] (step=0225600) Train Loss: 0.1473, Train Steps/Sec: 11.37, Grad Norm: 0.0062 +[2025-02-21 03:45:48] (step=0225700) Train Loss: 0.1482, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 03:45:55] (step=0225800) Train Loss: 0.1468, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 03:46:03] (step=0225900) Train Loss: 0.1479, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 03:46:13] (step=0226000) Train Loss: 0.1479, Train Steps/Sec: 10.30, Grad Norm: 0.0063 +[2025-02-21 03:46:22] (step=0226100) Train Loss: 0.1487, Train Steps/Sec: 11.69, Grad Norm: 0.0061 +[2025-02-21 03:46:30] (step=0226200) Train Loss: 0.1479, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 03:46:38] (step=0226300) Train Loss: 0.1467, Train Steps/Sec: 12.57, Grad Norm: 0.0065 +[2025-02-21 03:46:46] (step=0226400) Train Loss: 0.1467, Train Steps/Sec: 12.52, Grad Norm: 0.0065 +[2025-02-21 03:46:54] (step=0226500) Train Loss: 0.1468, Train Steps/Sec: 11.30, Grad Norm: 0.0062 +[2025-02-21 03:47:02] (step=0226600) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 03:47:10] (step=0226700) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 03:47:18] (step=0226800) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 03:47:26] (step=0226900) Train Loss: 0.1477, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 03:47:34] (step=0227000) Train Loss: 0.1486, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 03:47:43] (step=0227100) Train Loss: 0.1464, Train Steps/Sec: 11.11, Grad Norm: 0.0064 +[2025-02-21 03:47:53] (step=0227200) Train Loss: 0.1467, Train Steps/Sec: 10.60, Grad Norm: 0.0063 +[2025-02-21 03:48:00] (step=0227300) Train Loss: 0.1488, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 03:48:08] (step=0227400) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 03:48:16] (step=0227500) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 03:48:26] (step=0227600) Train Loss: 0.1460, Train Steps/Sec: 10.56, Grad Norm: 0.0063 +[2025-02-21 03:48:37] (step=0227700) Train Loss: 0.1478, Train Steps/Sec: 9.35, Grad Norm: 0.0063 +[2025-02-21 03:48:45] (step=0227800) Train Loss: 0.1477, Train Steps/Sec: 11.76, Grad Norm: 0.0060 +[2025-02-21 03:48:53] (step=0227900) Train Loss: 0.1479, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 03:49:01] (step=0228000) Train Loss: 0.1484, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 03:49:09] (step=0228100) Train Loss: 0.1490, Train Steps/Sec: 12.13, Grad Norm: 0.0064 +[2025-02-21 03:49:19] (step=0228200) Train Loss: 0.1483, Train Steps/Sec: 10.32, Grad Norm: 0.0063 +[2025-02-21 03:49:27] (step=0228300) Train Loss: 0.1470, Train Steps/Sec: 12.15, Grad Norm: 0.0063 +[2025-02-21 03:49:35] (step=0228400) Train Loss: 0.1480, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 03:49:43] (step=0228500) Train Loss: 0.1482, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 03:49:51] (step=0228600) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 03:49:59] (step=0228700) Train Loss: 0.1499, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 03:50:07] (step=0228800) Train Loss: 0.1477, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 03:50:15] (step=0228900) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 03:50:24] (step=0229000) Train Loss: 0.1470, Train Steps/Sec: 11.21, Grad Norm: 0.0065 +[2025-02-21 03:50:32] (step=0229100) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 03:50:40] (step=0229200) Train Loss: 0.1478, Train Steps/Sec: 11.80, Grad Norm: 0.0063 +[2025-02-21 03:50:49] (step=0229300) Train Loss: 0.1459, Train Steps/Sec: 10.67, Grad Norm: 0.0063 +[2025-02-21 03:50:58] (step=0229400) Train Loss: 0.1477, Train Steps/Sec: 11.70, Grad Norm: 0.0063 +[2025-02-21 03:51:06] (step=0229500) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 03:51:14] (step=0229600) Train Loss: 0.1482, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 03:51:22] (step=0229700) Train Loss: 0.1474, Train Steps/Sec: 11.74, Grad Norm: 0.0062 +[2025-02-21 03:51:31] (step=0229800) Train Loss: 0.1464, Train Steps/Sec: 10.96, Grad Norm: 0.0063 +[2025-02-21 03:51:41] (step=0229900) Train Loss: 0.1481, Train Steps/Sec: 10.95, Grad Norm: 0.0061 +[2025-02-21 03:51:49] (step=0230000) Train Loss: 0.1477, Train Steps/Sec: 11.35, Grad Norm: 0.0061 +[2025-02-21 03:51:57] (step=0230100) Train Loss: 0.1471, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 03:52:06] (step=0230200) Train Loss: 0.1472, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-21 03:52:15] (step=0230300) Train Loss: 0.1470, Train Steps/Sec: 11.67, Grad Norm: 0.0063 +[2025-02-21 03:52:24] (step=0230400) Train Loss: 0.1472, Train Steps/Sec: 10.29, Grad Norm: 0.0065 +[2025-02-21 03:52:32] (step=0230500) Train Loss: 0.1467, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 03:52:40] (step=0230600) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 03:52:48] (step=0230700) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 03:52:56] (step=0230800) Train Loss: 0.1457, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 03:53:04] (step=0230900) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:53:12] (step=0231000) Train Loss: 0.1477, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 03:53:20] (step=0231100) Train Loss: 0.1472, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 03:53:28] (step=0231200) Train Loss: 0.1468, Train Steps/Sec: 12.53, Grad Norm: 0.0063 +[2025-02-21 03:53:37] (step=0231300) Train Loss: 0.1469, Train Steps/Sec: 11.78, Grad Norm: 0.0061 +[2025-02-21 03:53:46] (step=0231400) Train Loss: 0.1486, Train Steps/Sec: 10.94, Grad Norm: 0.0065 +[2025-02-21 03:53:55] (step=0231500) Train Loss: 0.1470, Train Steps/Sec: 10.44, Grad Norm: 0.0062 +[2025-02-21 03:54:04] (step=0231600) Train Loss: 0.1481, Train Steps/Sec: 12.10, Grad Norm: 0.0062 +[2025-02-21 03:54:11] (step=0231700) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 03:54:19] (step=0231800) Train Loss: 0.1486, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 03:54:28] (step=0231900) Train Loss: 0.1466, Train Steps/Sec: 11.73, Grad Norm: 0.0063 +[2025-02-21 03:54:38] (step=0232000) Train Loss: 0.1475, Train Steps/Sec: 10.31, Grad Norm: 0.0064 +[2025-02-21 03:54:47] (step=0232100) Train Loss: 0.1484, Train Steps/Sec: 11.01, Grad Norm: 0.0065 +[2025-02-21 03:54:55] (step=0232200) Train Loss: 0.1459, Train Steps/Sec: 12.13, Grad Norm: 0.0061 +[2025-02-21 03:55:03] (step=0232300) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 03:55:11] (step=0232400) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 03:55:19] (step=0232500) Train Loss: 0.1468, Train Steps/Sec: 11.70, Grad Norm: 0.0061 +[2025-02-21 03:55:29] (step=0232600) Train Loss: 0.1464, Train Steps/Sec: 10.32, Grad Norm: 0.0062 +[2025-02-21 03:55:38] (step=0232700) Train Loss: 0.1467, Train Steps/Sec: 11.22, Grad Norm: 0.0065 +[2025-02-21 03:55:46] (step=0232800) Train Loss: 0.1475, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 03:55:54] (step=0232900) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 03:56:02] (step=0233000) Train Loss: 0.1490, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 03:56:10] (step=0233100) Train Loss: 0.1475, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 03:56:18] (step=0233200) Train Loss: 0.1480, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 03:56:26] (step=0233300) Train Loss: 0.1475, Train Steps/Sec: 12.58, Grad Norm: 0.0065 +[2025-02-21 03:56:34] (step=0233400) Train Loss: 0.1476, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 03:56:43] (step=0233500) Train Loss: 0.1478, Train Steps/Sec: 11.08, Grad Norm: 0.0062 +[2025-02-21 03:56:51] (step=0233600) Train Loss: 0.1482, Train Steps/Sec: 11.31, Grad Norm: 0.0060 +[2025-02-21 03:57:00] (step=0233700) Train Loss: 0.1469, Train Steps/Sec: 12.15, Grad Norm: 0.0064 +[2025-02-21 03:57:08] (step=0233800) Train Loss: 0.1468, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 03:57:16] (step=0233900) Train Loss: 0.1476, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 03:57:25] (step=0234000) Train Loss: 0.1490, Train Steps/Sec: 11.22, Grad Norm: 0.0065 +[2025-02-21 03:57:34] (step=0234100) Train Loss: 0.1484, Train Steps/Sec: 10.60, Grad Norm: 0.0061 +[2025-02-21 03:57:43] (step=0234200) Train Loss: 0.1470, Train Steps/Sec: 10.93, Grad Norm: 0.0062 +[2025-02-21 03:57:53] (step=0234300) Train Loss: 0.1488, Train Steps/Sec: 10.65, Grad Norm: 0.0063 +[2025-02-21 03:58:01] (step=0234400) Train Loss: 0.1475, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 03:58:09] (step=0234500) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 03:58:17] (step=0234600) Train Loss: 0.1479, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 03:58:25] (step=0234700) Train Loss: 0.1471, Train Steps/Sec: 11.66, Grad Norm: 0.0065 +[2025-02-21 03:58:35] (step=0234800) Train Loss: 0.1483, Train Steps/Sec: 10.29, Grad Norm: 0.0063 +[2025-02-21 03:58:43] (step=0234900) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 03:58:51] (step=0235000) Train Loss: 0.1471, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 03:58:59] (step=0235100) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 03:59:08] (step=0235200) Train Loss: 0.1484, Train Steps/Sec: 11.21, Grad Norm: 0.0063 +[2025-02-21 03:59:16] (step=0235300) Train Loss: 0.1478, Train Steps/Sec: 12.47, Grad Norm: 0.0062 +[2025-02-21 03:59:24] (step=0235400) Train Loss: 0.1473, Train Steps/Sec: 12.52, Grad Norm: 0.0062 +[2025-02-21 03:59:32] (step=0235500) Train Loss: 0.1467, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 03:59:40] (step=0235600) Train Loss: 0.1469, Train Steps/Sec: 11.68, Grad Norm: 0.0062 +[2025-02-21 03:59:49] (step=0235700) Train Loss: 0.1481, Train Steps/Sec: 11.02, Grad Norm: 0.0063 +[2025-02-21 03:59:58] (step=0235800) Train Loss: 0.1465, Train Steps/Sec: 11.70, Grad Norm: 0.0063 +[2025-02-21 04:00:06] (step=0235900) Train Loss: 0.1475, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 04:00:14] (step=0236000) Train Loss: 0.1473, Train Steps/Sec: 12.09, Grad Norm: 0.0060 +[2025-02-21 04:00:22] (step=0236100) Train Loss: 0.1475, Train Steps/Sec: 12.55, Grad Norm: 0.0062 +[2025-02-21 04:00:30] (step=0236200) Train Loss: 0.1487, Train Steps/Sec: 12.54, Grad Norm: 0.0061 +[2025-02-21 04:00:39] (step=0236300) Train Loss: 0.1468, Train Steps/Sec: 10.88, Grad Norm: 0.0064 +[2025-02-21 04:00:49] (step=0236400) Train Loss: 0.1483, Train Steps/Sec: 10.61, Grad Norm: 0.0062 +[2025-02-21 04:00:59] (step=0236500) Train Loss: 0.1468, Train Steps/Sec: 9.66, Grad Norm: 0.0061 +[2025-02-21 04:01:07] (step=0236600) Train Loss: 0.1480, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 04:01:15] (step=0236700) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 04:01:23] (step=0236800) Train Loss: 0.1479, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 04:01:32] (step=0236900) Train Loss: 0.1473, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-21 04:01:41] (step=0237000) Train Loss: 0.1474, Train Steps/Sec: 10.65, Grad Norm: 0.0062 +[2025-02-21 04:01:49] (step=0237100) Train Loss: 0.1483, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 04:01:57] (step=0237200) Train Loss: 0.1483, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 04:02:05] (step=0237300) Train Loss: 0.1470, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 04:02:13] (step=0237400) Train Loss: 0.1460, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 04:02:21] (step=0237500) Train Loss: 0.1486, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 04:02:29] (step=0237600) Train Loss: 0.1475, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 04:02:38] (step=0237700) Train Loss: 0.1473, Train Steps/Sec: 10.90, Grad Norm: 0.0063 +[2025-02-21 04:02:47] (step=0237800) Train Loss: 0.1484, Train Steps/Sec: 10.69, Grad Norm: 0.0063 +[2025-02-21 04:02:56] (step=0237900) Train Loss: 0.1468, Train Steps/Sec: 11.69, Grad Norm: 0.0063 +[2025-02-21 04:03:04] (step=0238000) Train Loss: 0.1478, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 04:03:12] (step=0238100) Train Loss: 0.1471, Train Steps/Sec: 12.17, Grad Norm: 0.0067 +[2025-02-21 04:03:20] (step=0238200) Train Loss: 0.1477, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 04:03:28] (step=0238300) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 04:03:36] (step=0238400) Train Loss: 0.1479, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 04:03:45] (step=0238500) Train Loss: 0.1470, Train Steps/Sec: 10.34, Grad Norm: 0.0063 +[2025-02-21 04:03:54] (step=0238600) Train Loss: 0.1482, Train Steps/Sec: 10.97, Grad Norm: 0.0064 +[2025-02-21 04:04:04] (step=0238700) Train Loss: 0.1479, Train Steps/Sec: 10.97, Grad Norm: 0.0061 +[2025-02-21 04:04:12] (step=0238800) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 04:04:19] (step=0238900) Train Loss: 0.1476, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 04:04:28] (step=0239000) Train Loss: 0.1475, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-21 04:04:38] (step=0239100) Train Loss: 0.1473, Train Steps/Sec: 10.94, Grad Norm: 0.0061 +[2025-02-21 04:04:47] (step=0239200) Train Loss: 0.1483, Train Steps/Sec: 10.93, Grad Norm: 0.0063 +[2025-02-21 04:04:55] (step=0239300) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 04:05:03] (step=0239400) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 04:05:10] (step=0239500) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 04:05:18] (step=0239600) Train Loss: 0.1493, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 04:05:26] (step=0239700) Train Loss: 0.1481, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 04:05:34] (step=0239800) Train Loss: 0.1461, Train Steps/Sec: 12.55, Grad Norm: 0.0062 +[2025-02-21 04:05:43] (step=0239900) Train Loss: 0.1486, Train Steps/Sec: 11.34, Grad Norm: 0.0062 +[2025-02-21 04:05:52] (step=0240000) Train Loss: 0.1468, Train Steps/Sec: 11.31, Grad Norm: 0.0060 +[2025-02-21 04:05:54] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0240000.pt +[2025-02-21 04:06:03] (step=0240100) Train Loss: 0.1477, Train Steps/Sec: 9.08, Grad Norm: 0.0063 +[2025-02-21 04:06:12] (step=0240200) Train Loss: 0.1467, Train Steps/Sec: 11.17, Grad Norm: 0.0064 +[2025-02-21 04:06:20] (step=0240300) Train Loss: 0.1468, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 04:06:28] (step=0240400) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 04:06:36] (step=0240500) Train Loss: 0.1477, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 04:06:44] (step=0240600) Train Loss: 0.1475, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 04:06:54] (step=0240700) Train Loss: 0.1470, Train Steps/Sec: 10.03, Grad Norm: 0.0061 +[2025-02-21 04:07:03] (step=0240800) Train Loss: 0.1487, Train Steps/Sec: 10.56, Grad Norm: 0.0063 +[2025-02-21 04:07:12] (step=0240900) Train Loss: 0.1494, Train Steps/Sec: 11.31, Grad Norm: 0.0061 +[2025-02-21 04:07:20] (step=0241000) Train Loss: 0.1478, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 04:07:28] (step=0241100) Train Loss: 0.1481, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 04:07:36] (step=0241200) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 04:07:45] (step=0241300) Train Loss: 0.1459, Train Steps/Sec: 11.32, Grad Norm: 0.0064 +[2025-02-21 04:07:54] (step=0241400) Train Loss: 0.1461, Train Steps/Sec: 10.62, Grad Norm: 0.0064 +[2025-02-21 04:08:03] (step=0241500) Train Loss: 0.1483, Train Steps/Sec: 11.19, Grad Norm: 0.0063 +[2025-02-21 04:08:11] (step=0241600) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 04:08:19] (step=0241700) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 04:08:27] (step=0241800) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 04:08:35] (step=0241900) Train Loss: 0.1473, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 04:08:44] (step=0242000) Train Loss: 0.1474, Train Steps/Sec: 11.77, Grad Norm: 0.0064 +[2025-02-21 04:08:53] (step=0242100) Train Loss: 0.1478, Train Steps/Sec: 11.02, Grad Norm: 0.0064 +[2025-02-21 04:09:01] (step=0242200) Train Loss: 0.1476, Train Steps/Sec: 11.71, Grad Norm: 0.0061 +[2025-02-21 04:09:09] (step=0242300) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 04:09:17] (step=0242400) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 04:09:25] (step=0242500) Train Loss: 0.1480, Train Steps/Sec: 12.15, Grad Norm: 0.0063 +[2025-02-21 04:09:33] (step=0242600) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 04:09:42] (step=0242700) Train Loss: 0.1471, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-21 04:09:50] (step=0242800) Train Loss: 0.1466, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 04:10:00] (step=0242900) Train Loss: 0.1463, Train Steps/Sec: 9.96, Grad Norm: 0.0063 +[2025-02-21 04:10:09] (step=0243000) Train Loss: 0.1474, Train Steps/Sec: 11.32, Grad Norm: 0.0062 +[2025-02-21 04:10:18] (step=0243100) Train Loss: 0.1475, Train Steps/Sec: 11.32, Grad Norm: 0.0064 +[2025-02-21 04:10:26] (step=0243200) Train Loss: 0.1488, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 04:10:34] (step=0243300) Train Loss: 0.1474, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 04:10:42] (step=0243400) Train Loss: 0.1480, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 04:10:51] (step=0243500) Train Loss: 0.1469, Train Steps/Sec: 10.62, Grad Norm: 0.0063 +[2025-02-21 04:11:00] (step=0243600) Train Loss: 0.1474, Train Steps/Sec: 11.33, Grad Norm: 0.0065 +[2025-02-21 04:11:08] (step=0243700) Train Loss: 0.1476, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 04:11:16] (step=0243800) Train Loss: 0.1476, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 04:11:24] (step=0243900) Train Loss: 0.1476, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 04:11:33] (step=0244000) Train Loss: 0.1455, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-21 04:11:41] (step=0244100) Train Loss: 0.1476, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 04:11:50] (step=0244200) Train Loss: 0.1468, Train Steps/Sec: 10.66, Grad Norm: 0.0062 +[2025-02-21 04:11:59] (step=0244300) Train Loss: 0.1472, Train Steps/Sec: 11.71, Grad Norm: 0.0062 +[2025-02-21 04:12:07] (step=0244400) Train Loss: 0.1472, Train Steps/Sec: 12.53, Grad Norm: 0.0060 +[2025-02-21 04:12:15] (step=0244500) Train Loss: 0.1481, Train Steps/Sec: 12.53, Grad Norm: 0.0060 +[2025-02-21 04:12:23] (step=0244600) Train Loss: 0.1474, Train Steps/Sec: 12.54, Grad Norm: 0.0063 +[2025-02-21 04:12:31] (step=0244700) Train Loss: 0.1458, Train Steps/Sec: 12.11, Grad Norm: 0.0060 +[2025-02-21 04:12:39] (step=0244800) Train Loss: 0.1468, Train Steps/Sec: 12.55, Grad Norm: 0.0063 +[2025-02-21 04:12:47] (step=0244900) Train Loss: 0.1469, Train Steps/Sec: 12.55, Grad Norm: 0.0059 +[2025-02-21 04:12:55] (step=0245000) Train Loss: 0.1466, Train Steps/Sec: 12.10, Grad Norm: 0.0061 +[2025-02-21 04:13:06] (step=0245100) Train Loss: 0.1465, Train Steps/Sec: 9.66, Grad Norm: 0.0063 +[2025-02-21 04:13:16] (step=0245200) Train Loss: 0.1468, Train Steps/Sec: 10.15, Grad Norm: 0.0065 +[2025-02-21 04:13:25] (step=0245300) Train Loss: 0.1474, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-21 04:13:33] (step=0245400) Train Loss: 0.1469, Train Steps/Sec: 12.55, Grad Norm: 0.0060 +[2025-02-21 04:13:40] (step=0245500) Train Loss: 0.1474, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 04:13:49] (step=0245600) Train Loss: 0.1480, Train Steps/Sec: 12.10, Grad Norm: 0.0060 +[2025-02-21 04:13:58] (step=0245700) Train Loss: 0.1475, Train Steps/Sec: 10.98, Grad Norm: 0.0063 +[2025-02-21 04:14:07] (step=0245800) Train Loss: 0.1464, Train Steps/Sec: 11.30, Grad Norm: 0.0062 +[2025-02-21 04:14:15] (step=0245900) Train Loss: 0.1471, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 04:14:23] (step=0246000) Train Loss: 0.1469, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 04:14:31] (step=0246100) Train Loss: 0.1459, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 04:14:38] (step=0246200) Train Loss: 0.1471, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 04:14:47] (step=0246300) Train Loss: 0.1468, Train Steps/Sec: 11.37, Grad Norm: 0.0063 +[2025-02-21 04:14:56] (step=0246400) Train Loss: 0.1468, Train Steps/Sec: 11.35, Grad Norm: 0.0062 +[2025-02-21 04:15:06] (step=0246500) Train Loss: 0.1477, Train Steps/Sec: 10.55, Grad Norm: 0.0062 +[2025-02-21 04:15:14] (step=0246600) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 04:15:21] (step=0246700) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 04:15:29] (step=0246800) Train Loss: 0.1480, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 04:15:38] (step=0246900) Train Loss: 0.1481, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 04:15:46] (step=0247000) Train Loss: 0.1482, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 04:15:53] (step=0247100) Train Loss: 0.1480, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 04:16:03] (step=0247200) Train Loss: 0.1466, Train Steps/Sec: 10.67, Grad Norm: 0.0062 +[2025-02-21 04:16:12] (step=0247300) Train Loss: 0.1468, Train Steps/Sec: 10.95, Grad Norm: 0.0061 +[2025-02-21 04:16:21] (step=0247400) Train Loss: 0.1466, Train Steps/Sec: 10.59, Grad Norm: 0.0061 +[2025-02-21 04:16:30] (step=0247500) Train Loss: 0.1470, Train Steps/Sec: 12.11, Grad Norm: 0.0062 +[2025-02-21 04:16:38] (step=0247600) Train Loss: 0.1483, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 04:16:47] (step=0247700) Train Loss: 0.1468, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-21 04:16:55] (step=0247800) Train Loss: 0.1471, Train Steps/Sec: 12.08, Grad Norm: 0.0063 +[2025-02-21 04:17:04] (step=0247900) Train Loss: 0.1471, Train Steps/Sec: 10.60, Grad Norm: 0.0060 +[2025-02-21 04:17:13] (step=0248000) Train Loss: 0.1475, Train Steps/Sec: 11.72, Grad Norm: 0.0061 +[2025-02-21 04:17:21] (step=0248100) Train Loss: 0.1473, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 04:17:29] (step=0248200) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 04:17:37] (step=0248300) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 04:17:45] (step=0248400) Train Loss: 0.1471, Train Steps/Sec: 11.80, Grad Norm: 0.0061 +[2025-02-21 04:17:54] (step=0248500) Train Loss: 0.1484, Train Steps/Sec: 11.03, Grad Norm: 0.0063 +[2025-02-21 04:18:03] (step=0248600) Train Loss: 0.1486, Train Steps/Sec: 11.72, Grad Norm: 0.0064 +[2025-02-21 04:18:11] (step=0248700) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 04:18:19] (step=0248800) Train Loss: 0.1484, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 04:18:27] (step=0248900) Train Loss: 0.1469, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 04:18:36] (step=0249000) Train Loss: 0.1468, Train Steps/Sec: 10.83, Grad Norm: 0.0061 +[2025-02-21 04:18:44] (step=0249100) Train Loss: 0.1463, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 04:18:52] (step=0249200) Train Loss: 0.1466, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 04:19:00] (step=0249300) Train Loss: 0.1490, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 04:19:09] (step=0249400) Train Loss: 0.1479, Train Steps/Sec: 10.63, Grad Norm: 0.0062 +[2025-02-21 04:19:18] (step=0249500) Train Loss: 0.1476, Train Steps/Sec: 10.96, Grad Norm: 0.0061 +[2025-02-21 04:19:28] (step=0249600) Train Loss: 0.1467, Train Steps/Sec: 10.60, Grad Norm: 0.0062 +[2025-02-21 04:19:36] (step=0249700) Train Loss: 0.1465, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 04:19:44] (step=0249800) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 04:19:52] (step=0249900) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 04:20:00] (step=0250000) Train Loss: 0.1472, Train Steps/Sec: 12.11, Grad Norm: 0.0061 +[2025-02-21 04:20:09] (step=0250100) Train Loss: 0.1477, Train Steps/Sec: 10.60, Grad Norm: 0.0063 +[2025-02-21 04:20:18] (step=0250200) Train Loss: 0.1467, Train Steps/Sec: 11.14, Grad Norm: 0.0062 +[2025-02-21 04:20:27] (step=0250300) Train Loss: 0.1472, Train Steps/Sec: 11.75, Grad Norm: 0.0062 +[2025-02-21 04:20:35] (step=0250400) Train Loss: 0.1466, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 04:20:43] (step=0250500) Train Loss: 0.1491, Train Steps/Sec: 12.14, Grad Norm: 0.0063 +[2025-02-21 04:20:52] (step=0250600) Train Loss: 0.1474, Train Steps/Sec: 11.03, Grad Norm: 0.0063 +[2025-02-21 04:21:01] (step=0250700) Train Loss: 0.1473, Train Steps/Sec: 11.77, Grad Norm: 0.0061 +[2025-02-21 04:21:09] (step=0250800) Train Loss: 0.1479, Train Steps/Sec: 12.16, Grad Norm: 0.0062 +[2025-02-21 04:21:17] (step=0250900) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 04:21:25] (step=0251000) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 04:21:33] (step=0251100) Train Loss: 0.1470, Train Steps/Sec: 12.56, Grad Norm: 0.0064 +[2025-02-21 04:21:41] (step=0251200) Train Loss: 0.1468, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 04:21:49] (step=0251300) Train Loss: 0.1483, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 04:21:57] (step=0251400) Train Loss: 0.1468, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 04:22:06] (step=0251500) Train Loss: 0.1459, Train Steps/Sec: 11.19, Grad Norm: 0.0062 +[2025-02-21 04:22:15] (step=0251600) Train Loss: 0.1467, Train Steps/Sec: 10.57, Grad Norm: 0.0062 +[2025-02-21 04:22:25] (step=0251700) Train Loss: 0.1484, Train Steps/Sec: 10.55, Grad Norm: 0.0064 +[2025-02-21 04:22:34] (step=0251800) Train Loss: 0.1485, Train Steps/Sec: 10.94, Grad Norm: 0.0062 +[2025-02-21 04:22:42] (step=0251900) Train Loss: 0.1486, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 04:22:50] (step=0252000) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 04:22:58] (step=0252100) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 04:23:06] (step=0252200) Train Loss: 0.1463, Train Steps/Sec: 12.16, Grad Norm: 0.0060 +[2025-02-21 04:23:16] (step=0252300) Train Loss: 0.1466, Train Steps/Sec: 10.34, Grad Norm: 0.0062 +[2025-02-21 04:23:24] (step=0252400) Train Loss: 0.1463, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 04:23:32] (step=0252500) Train Loss: 0.1477, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 04:23:40] (step=0252600) Train Loss: 0.1472, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 04:23:48] (step=0252700) Train Loss: 0.1473, Train Steps/Sec: 11.76, Grad Norm: 0.0059 +[2025-02-21 04:23:58] (step=0252800) Train Loss: 0.1482, Train Steps/Sec: 9.93, Grad Norm: 0.0062 +[2025-02-21 04:24:07] (step=0252900) Train Loss: 0.1480, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 04:24:14] (step=0253000) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 04:24:22] (step=0253100) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 04:24:30] (step=0253200) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 04:24:38] (step=0253300) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 04:24:46] (step=0253400) Train Loss: 0.1479, Train Steps/Sec: 12.18, Grad Norm: 0.0063 +[2025-02-21 04:24:54] (step=0253500) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 04:25:02] (step=0253600) Train Loss: 0.1474, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 04:25:10] (step=0253700) Train Loss: 0.1480, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 04:25:20] (step=0253800) Train Loss: 0.1484, Train Steps/Sec: 9.96, Grad Norm: 0.0064 +[2025-02-21 04:25:30] (step=0253900) Train Loss: 0.1474, Train Steps/Sec: 10.92, Grad Norm: 0.0062 +[2025-02-21 04:25:40] (step=0254000) Train Loss: 0.1466, Train Steps/Sec: 9.84, Grad Norm: 0.0059 +[2025-02-21 04:25:48] (step=0254100) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 04:25:56] (step=0254200) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 04:26:04] (step=0254300) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 04:26:12] (step=0254400) Train Loss: 0.1476, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 04:26:21] (step=0254500) Train Loss: 0.1474, Train Steps/Sec: 10.30, Grad Norm: 0.0064 +[2025-02-21 04:26:30] (step=0254600) Train Loss: 0.1479, Train Steps/Sec: 12.10, Grad Norm: 0.0064 +[2025-02-21 04:26:38] (step=0254700) Train Loss: 0.1473, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 04:26:46] (step=0254800) Train Loss: 0.1490, Train Steps/Sec: 11.81, Grad Norm: 0.0062 +[2025-02-21 04:26:55] (step=0254900) Train Loss: 0.1463, Train Steps/Sec: 11.36, Grad Norm: 0.0063 +[2025-02-21 04:27:04] (step=0255000) Train Loss: 0.1480, Train Steps/Sec: 11.71, Grad Norm: 0.0063 +[2025-02-21 04:27:11] (step=0255100) Train Loss: 0.1468, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 04:27:19] (step=0255200) Train Loss: 0.1490, Train Steps/Sec: 12.55, Grad Norm: 0.0064 +[2025-02-21 04:27:28] (step=0255300) Train Loss: 0.1474, Train Steps/Sec: 11.22, Grad Norm: 0.0064 +[2025-02-21 04:27:36] (step=0255400) Train Loss: 0.1480, Train Steps/Sec: 12.55, Grad Norm: 0.0062 +[2025-02-21 04:27:44] (step=0255500) Train Loss: 0.1478, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 04:27:52] (step=0255600) Train Loss: 0.1467, Train Steps/Sec: 12.16, Grad Norm: 0.0062 +[2025-02-21 04:28:00] (step=0255700) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 04:28:08] (step=0255800) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 04:28:16] (step=0255900) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 04:28:26] (step=0256000) Train Loss: 0.1472, Train Steps/Sec: 10.00, Grad Norm: 0.0062 +[2025-02-21 04:28:35] (step=0256100) Train Loss: 0.1468, Train Steps/Sec: 10.95, Grad Norm: 0.0062 +[2025-02-21 04:28:45] (step=0256200) Train Loss: 0.1459, Train Steps/Sec: 10.96, Grad Norm: 0.0065 +[2025-02-21 04:28:52] (step=0256300) Train Loss: 0.1485, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 04:29:00] (step=0256400) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 04:29:09] (step=0256500) Train Loss: 0.1475, Train Steps/Sec: 11.23, Grad Norm: 0.0063 +[2025-02-21 04:29:18] (step=0256600) Train Loss: 0.1474, Train Steps/Sec: 11.71, Grad Norm: 0.0063 +[2025-02-21 04:29:28] (step=0256700) Train Loss: 0.1463, Train Steps/Sec: 10.27, Grad Norm: 0.0061 +[2025-02-21 04:29:36] (step=0256800) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 04:29:44] (step=0256900) Train Loss: 0.1471, Train Steps/Sec: 12.21, Grad Norm: 0.0062 +[2025-02-21 04:29:53] (step=0257000) Train Loss: 0.1457, Train Steps/Sec: 11.38, Grad Norm: 0.0060 +[2025-02-21 04:30:01] (step=0257100) Train Loss: 0.1482, Train Steps/Sec: 11.70, Grad Norm: 0.0060 +[2025-02-21 04:30:09] (step=0257200) Train Loss: 0.1469, Train Steps/Sec: 12.06, Grad Norm: 0.0062 +[2025-02-21 04:30:17] (step=0257300) Train Loss: 0.1477, Train Steps/Sec: 12.51, Grad Norm: 0.0061 +[2025-02-21 04:30:25] (step=0257400) Train Loss: 0.1473, Train Steps/Sec: 12.49, Grad Norm: 0.0067 +[2025-02-21 04:30:33] (step=0257500) Train Loss: 0.1476, Train Steps/Sec: 12.51, Grad Norm: 0.0061 +[2025-02-21 04:30:41] (step=0257600) Train Loss: 0.1481, Train Steps/Sec: 12.51, Grad Norm: 0.0061 +[2025-02-21 04:30:49] (step=0257700) Train Loss: 0.1464, Train Steps/Sec: 12.44, Grad Norm: 0.0061 +[2025-02-21 04:30:59] (step=0257800) Train Loss: 0.1462, Train Steps/Sec: 10.88, Grad Norm: 0.0062 +[2025-02-21 04:31:07] (step=0257900) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 04:31:14] (step=0258000) Train Loss: 0.1473, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 04:31:23] (step=0258100) Train Loss: 0.1483, Train Steps/Sec: 12.18, Grad Norm: 0.0062 +[2025-02-21 04:31:33] (step=0258200) Train Loss: 0.1471, Train Steps/Sec: 9.71, Grad Norm: 0.0062 +[2025-02-21 04:31:42] (step=0258300) Train Loss: 0.1458, Train Steps/Sec: 11.35, Grad Norm: 0.0064 +[2025-02-21 04:31:50] (step=0258400) Train Loss: 0.1481, Train Steps/Sec: 11.72, Grad Norm: 0.0061 +[2025-02-21 04:31:58] (step=0258500) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 04:32:06] (step=0258600) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 04:32:14] (step=0258700) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 04:32:23] (step=0258800) Train Loss: 0.1470, Train Steps/Sec: 11.70, Grad Norm: 0.0063 +[2025-02-21 04:32:32] (step=0258900) Train Loss: 0.1461, Train Steps/Sec: 10.26, Grad Norm: 0.0062 +[2025-02-21 04:32:41] (step=0259000) Train Loss: 0.1465, Train Steps/Sec: 11.20, Grad Norm: 0.0061 +[2025-02-21 04:32:50] (step=0259100) Train Loss: 0.1475, Train Steps/Sec: 11.72, Grad Norm: 0.0061 +[2025-02-21 04:32:59] (step=0259200) Train Loss: 0.1462, Train Steps/Sec: 11.03, Grad Norm: 0.0061 +[2025-02-21 04:33:08] (step=0259300) Train Loss: 0.1480, Train Steps/Sec: 11.73, Grad Norm: 0.0063 +[2025-02-21 04:33:15] (step=0259400) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 04:33:23] (step=0259500) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 04:33:31] (step=0259600) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 04:33:39] (step=0259700) Train Loss: 0.1471, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 04:33:47] (step=0259800) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 04:33:55] (step=0259900) Train Loss: 0.1476, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 04:34:03] (step=0260000) Train Loss: 0.1483, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 04:34:06] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0260000.pt +[2025-02-21 04:34:14] (step=0260100) Train Loss: 0.1482, Train Steps/Sec: 9.05, Grad Norm: 0.0063 +[2025-02-21 04:34:22] (step=0260200) Train Loss: 0.1477, Train Steps/Sec: 12.55, Grad Norm: 0.0062 +[2025-02-21 04:34:32] (step=0260300) Train Loss: 0.1463, Train Steps/Sec: 10.23, Grad Norm: 0.0059 +[2025-02-21 04:34:42] (step=0260400) Train Loss: 0.1469, Train Steps/Sec: 10.25, Grad Norm: 0.0061 +[2025-02-21 04:34:51] (step=0260500) Train Loss: 0.1465, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 04:35:00] (step=0260600) Train Loss: 0.1475, Train Steps/Sec: 11.35, Grad Norm: 0.0063 +[2025-02-21 04:35:07] (step=0260700) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 04:35:15] (step=0260800) Train Loss: 0.1488, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 04:35:23] (step=0260900) Train Loss: 0.1476, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 04:35:32] (step=0261000) Train Loss: 0.1475, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 04:35:42] (step=0261100) Train Loss: 0.1470, Train Steps/Sec: 10.58, Grad Norm: 0.0063 +[2025-02-21 04:35:50] (step=0261200) Train Loss: 0.1477, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 04:35:59] (step=0261300) Train Loss: 0.1482, Train Steps/Sec: 10.70, Grad Norm: 0.0063 +[2025-02-21 04:36:08] (step=0261400) Train Loss: 0.1466, Train Steps/Sec: 11.71, Grad Norm: 0.0060 +[2025-02-21 04:36:17] (step=0261500) Train Loss: 0.1487, Train Steps/Sec: 11.21, Grad Norm: 0.0064 +[2025-02-21 04:36:25] (step=0261600) Train Loss: 0.1480, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 04:36:33] (step=0261700) Train Loss: 0.1486, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 04:36:41] (step=0261800) Train Loss: 0.1465, Train Steps/Sec: 12.56, Grad Norm: 0.0063 +[2025-02-21 04:36:49] (step=0261900) Train Loss: 0.1474, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 04:36:56] (step=0262000) Train Loss: 0.1475, Train Steps/Sec: 12.57, Grad Norm: 0.0064 +[2025-02-21 04:37:05] (step=0262100) Train Loss: 0.1473, Train Steps/Sec: 12.17, Grad Norm: 0.0062 +[2025-02-21 04:37:13] (step=0262200) Train Loss: 0.1464, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 04:37:21] (step=0262300) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 04:37:28] (step=0262400) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 04:37:37] (step=0262500) Train Loss: 0.1478, Train Steps/Sec: 11.35, Grad Norm: 0.0063 +[2025-02-21 04:37:47] (step=0262600) Train Loss: 0.1476, Train Steps/Sec: 10.28, Grad Norm: 0.0061 +[2025-02-21 04:37:56] (step=0262700) Train Loss: 0.1469, Train Steps/Sec: 11.28, Grad Norm: 0.0060 +[2025-02-21 04:38:06] (step=0262800) Train Loss: 0.1464, Train Steps/Sec: 10.20, Grad Norm: 0.0062 +[2025-02-21 04:38:14] (step=0262900) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 04:38:22] (step=0263000) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 04:38:29] (step=0263100) Train Loss: 0.1478, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 04:38:39] (step=0263200) Train Loss: 0.1484, Train Steps/Sec: 11.01, Grad Norm: 0.0061 +[2025-02-21 04:38:48] (step=0263300) Train Loss: 0.1487, Train Steps/Sec: 10.61, Grad Norm: 0.0062 +[2025-02-21 04:38:56] (step=0263400) Train Loss: 0.1478, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 04:39:05] (step=0263500) Train Loss: 0.1464, Train Steps/Sec: 11.06, Grad Norm: 0.0063 +[2025-02-21 04:39:14] (step=0263600) Train Loss: 0.1468, Train Steps/Sec: 11.72, Grad Norm: 0.0061 +[2025-02-21 04:39:22] (step=0263700) Train Loss: 0.1483, Train Steps/Sec: 12.58, Grad Norm: 0.0065 +[2025-02-21 04:39:30] (step=0263800) Train Loss: 0.1483, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 04:39:38] (step=0263900) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 04:39:47] (step=0264000) Train Loss: 0.1475, Train Steps/Sec: 11.20, Grad Norm: 0.0062 +[2025-02-21 04:39:55] (step=0264100) Train Loss: 0.1477, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 04:40:02] (step=0264200) Train Loss: 0.1472, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 04:40:11] (step=0264300) Train Loss: 0.1468, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 04:40:19] (step=0264400) Train Loss: 0.1467, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 04:40:27] (step=0264500) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 04:40:35] (step=0264600) Train Loss: 0.1486, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 04:40:44] (step=0264700) Train Loss: 0.1470, Train Steps/Sec: 10.29, Grad Norm: 0.0060 +[2025-02-21 04:40:53] (step=0264800) Train Loss: 0.1464, Train Steps/Sec: 11.37, Grad Norm: 0.0060 +[2025-02-21 04:41:02] (step=0264900) Train Loss: 0.1474, Train Steps/Sec: 10.95, Grad Norm: 0.0065 +[2025-02-21 04:41:11] (step=0265000) Train Loss: 0.1479, Train Steps/Sec: 11.73, Grad Norm: 0.0064 +[2025-02-21 04:41:19] (step=0265100) Train Loss: 0.1485, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 04:41:27] (step=0265200) Train Loss: 0.1471, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 04:41:35] (step=0265300) Train Loss: 0.1472, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-21 04:41:45] (step=0265400) Train Loss: 0.1474, Train Steps/Sec: 10.98, Grad Norm: 0.0061 +[2025-02-21 04:41:54] (step=0265500) Train Loss: 0.1468, Train Steps/Sec: 10.28, Grad Norm: 0.0061 +[2025-02-21 04:42:03] (step=0265600) Train Loss: 0.1470, Train Steps/Sec: 11.06, Grad Norm: 0.0060 +[2025-02-21 04:42:12] (step=0265700) Train Loss: 0.1469, Train Steps/Sec: 11.69, Grad Norm: 0.0061 +[2025-02-21 04:42:20] (step=0265800) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 04:42:28] (step=0265900) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 04:42:36] (step=0266000) Train Loss: 0.1481, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 04:42:44] (step=0266100) Train Loss: 0.1475, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 04:42:52] (step=0266200) Train Loss: 0.1484, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 04:43:00] (step=0266300) Train Loss: 0.1464, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 04:43:08] (step=0266400) Train Loss: 0.1486, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 04:43:17] (step=0266500) Train Loss: 0.1473, Train Steps/Sec: 10.87, Grad Norm: 0.0060 +[2025-02-21 04:43:25] (step=0266600) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 04:43:33] (step=0266700) Train Loss: 0.1495, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 04:43:41] (step=0266800) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 04:43:51] (step=0266900) Train Loss: 0.1475, Train Steps/Sec: 9.97, Grad Norm: 0.0061 +[2025-02-21 04:43:59] (step=0267000) Train Loss: 0.1462, Train Steps/Sec: 11.33, Grad Norm: 0.0060 +[2025-02-21 04:44:09] (step=0267100) Train Loss: 0.1478, Train Steps/Sec: 10.63, Grad Norm: 0.0066 +[2025-02-21 04:44:17] (step=0267200) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 04:44:25] (step=0267300) Train Loss: 0.1473, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 04:44:33] (step=0267400) Train Loss: 0.1482, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 04:44:41] (step=0267500) Train Loss: 0.1462, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 04:44:50] (step=0267600) Train Loss: 0.1488, Train Steps/Sec: 10.66, Grad Norm: 0.0061 +[2025-02-21 04:44:59] (step=0267700) Train Loss: 0.1473, Train Steps/Sec: 10.59, Grad Norm: 0.0061 +[2025-02-21 04:45:10] (step=0267800) Train Loss: 0.1470, Train Steps/Sec: 9.71, Grad Norm: 0.0061 +[2025-02-21 04:45:18] (step=0267900) Train Loss: 0.1474, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 04:45:26] (step=0268000) Train Loss: 0.1469, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 04:45:34] (step=0268100) Train Loss: 0.1476, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 04:45:42] (step=0268200) Train Loss: 0.1483, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 04:45:49] (step=0268300) Train Loss: 0.1484, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 04:45:57] (step=0268400) Train Loss: 0.1478, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 04:46:05] (step=0268500) Train Loss: 0.1479, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 04:46:13] (step=0268600) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 04:46:21] (step=0268700) Train Loss: 0.1481, Train Steps/Sec: 12.19, Grad Norm: 0.0064 +[2025-02-21 04:46:29] (step=0268800) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 04:46:37] (step=0268900) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 04:46:46] (step=0269000) Train Loss: 0.1476, Train Steps/Sec: 11.22, Grad Norm: 0.0061 +[2025-02-21 04:46:57] (step=0269100) Train Loss: 0.1469, Train Steps/Sec: 9.43, Grad Norm: 0.0060 +[2025-02-21 04:47:05] (step=0269200) Train Loss: 0.1480, Train Steps/Sec: 11.70, Grad Norm: 0.0063 +[2025-02-21 04:47:14] (step=0269300) Train Loss: 0.1473, Train Steps/Sec: 11.02, Grad Norm: 0.0061 +[2025-02-21 04:47:22] (step=0269400) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 04:47:30] (step=0269500) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 04:47:38] (step=0269600) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 04:47:46] (step=0269700) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 04:47:56] (step=0269800) Train Loss: 0.1459, Train Steps/Sec: 10.69, Grad Norm: 0.0062 +[2025-02-21 04:48:06] (step=0269900) Train Loss: 0.1470, Train Steps/Sec: 9.76, Grad Norm: 0.0061 +[2025-02-21 04:48:14] (step=0270000) Train Loss: 0.1473, Train Steps/Sec: 11.73, Grad Norm: 0.0065 +[2025-02-21 04:48:22] (step=0270100) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 04:48:30] (step=0270200) Train Loss: 0.1472, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 04:48:39] (step=0270300) Train Loss: 0.1465, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-21 04:48:47] (step=0270400) Train Loss: 0.1474, Train Steps/Sec: 12.54, Grad Norm: 0.0061 +[2025-02-21 04:48:55] (step=0270500) Train Loss: 0.1468, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 04:49:03] (step=0270600) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 04:49:11] (step=0270700) Train Loss: 0.1474, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 04:49:19] (step=0270800) Train Loss: 0.1470, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 04:49:27] (step=0270900) Train Loss: 0.1471, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 04:49:35] (step=0271000) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 04:49:43] (step=0271100) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 04:49:51] (step=0271200) Train Loss: 0.1466, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 04:50:02] (step=0271300) Train Loss: 0.1480, Train Steps/Sec: 9.72, Grad Norm: 0.0059 +[2025-02-21 04:50:10] (step=0271400) Train Loss: 0.1477, Train Steps/Sec: 11.72, Grad Norm: 0.0062 +[2025-02-21 04:50:20] (step=0271500) Train Loss: 0.1471, Train Steps/Sec: 9.91, Grad Norm: 0.0062 +[2025-02-21 04:50:28] (step=0271600) Train Loss: 0.1484, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 04:50:36] (step=0271700) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 04:50:44] (step=0271800) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 04:50:53] (step=0271900) Train Loss: 0.1482, Train Steps/Sec: 11.35, Grad Norm: 0.0064 +[2025-02-21 04:51:03] (step=0272000) Train Loss: 0.1469, Train Steps/Sec: 10.09, Grad Norm: 0.0059 +[2025-02-21 04:51:12] (step=0272100) Train Loss: 0.1472, Train Steps/Sec: 10.58, Grad Norm: 0.0064 +[2025-02-21 04:51:20] (step=0272200) Train Loss: 0.1476, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 04:51:28] (step=0272300) Train Loss: 0.1466, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 04:51:36] (step=0272400) Train Loss: 0.1491, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 04:51:44] (step=0272500) Train Loss: 0.1479, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 04:51:52] (step=0272600) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 04:52:00] (step=0272700) Train Loss: 0.1481, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 04:52:09] (step=0272800) Train Loss: 0.1470, Train Steps/Sec: 11.24, Grad Norm: 0.0063 +[2025-02-21 04:52:17] (step=0272900) Train Loss: 0.1478, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 04:52:25] (step=0273000) Train Loss: 0.1477, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 04:52:33] (step=0273100) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 04:52:41] (step=0273200) Train Loss: 0.1475, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 04:52:49] (step=0273300) Train Loss: 0.1477, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 04:52:57] (step=0273400) Train Loss: 0.1463, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 04:53:07] (step=0273500) Train Loss: 0.1471, Train Steps/Sec: 9.74, Grad Norm: 0.0062 +[2025-02-21 04:53:16] (step=0273600) Train Loss: 0.1478, Train Steps/Sec: 11.70, Grad Norm: 0.0064 +[2025-02-21 04:53:25] (step=0273700) Train Loss: 0.1486, Train Steps/Sec: 10.98, Grad Norm: 0.0058 +[2025-02-21 04:53:33] (step=0273800) Train Loss: 0.1476, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 04:53:41] (step=0273900) Train Loss: 0.1464, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 04:53:50] (step=0274000) Train Loss: 0.1467, Train Steps/Sec: 10.49, Grad Norm: 0.0060 +[2025-02-21 04:53:59] (step=0274100) Train Loss: 0.1491, Train Steps/Sec: 11.68, Grad Norm: 0.0060 +[2025-02-21 04:54:09] (step=0274200) Train Loss: 0.1470, Train Steps/Sec: 9.47, Grad Norm: 0.0062 +[2025-02-21 04:54:18] (step=0274300) Train Loss: 0.1470, Train Steps/Sec: 11.69, Grad Norm: 0.0062 +[2025-02-21 04:54:26] (step=0274400) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 04:54:34] (step=0274500) Train Loss: 0.1475, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 04:54:42] (step=0274600) Train Loss: 0.1478, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 04:54:50] (step=0274700) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 04:54:58] (step=0274800) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 04:55:06] (step=0274900) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 04:55:13] (step=0275000) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 04:55:21] (step=0275100) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 04:55:30] (step=0275200) Train Loss: 0.1479, Train Steps/Sec: 12.02, Grad Norm: 0.0062 +[2025-02-21 04:55:39] (step=0275300) Train Loss: 0.1472, Train Steps/Sec: 11.29, Grad Norm: 0.0062 +[2025-02-21 04:55:47] (step=0275400) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 04:55:54] (step=0275500) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 04:56:04] (step=0275600) Train Loss: 0.1472, Train Steps/Sec: 11.00, Grad Norm: 0.0063 +[2025-02-21 04:56:13] (step=0275700) Train Loss: 0.1485, Train Steps/Sec: 10.67, Grad Norm: 0.0060 +[2025-02-21 04:56:22] (step=0275800) Train Loss: 0.1477, Train Steps/Sec: 11.37, Grad Norm: 0.0062 +[2025-02-21 04:56:31] (step=0275900) Train Loss: 0.1461, Train Steps/Sec: 11.37, Grad Norm: 0.0061 +[2025-02-21 04:56:38] (step=0276000) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 04:56:46] (step=0276100) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 04:56:55] (step=0276200) Train Loss: 0.1470, Train Steps/Sec: 11.77, Grad Norm: 0.0063 +[2025-02-21 04:57:04] (step=0276300) Train Loss: 0.1475, Train Steps/Sec: 11.02, Grad Norm: 0.0061 +[2025-02-21 04:57:14] (step=0276400) Train Loss: 0.1463, Train Steps/Sec: 10.31, Grad Norm: 0.0059 +[2025-02-21 04:57:23] (step=0276500) Train Loss: 0.1468, Train Steps/Sec: 10.45, Grad Norm: 0.0060 +[2025-02-21 04:57:31] (step=0276600) Train Loss: 0.1468, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 04:57:39] (step=0276700) Train Loss: 0.1487, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 04:57:47] (step=0276800) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 04:57:55] (step=0276900) Train Loss: 0.1480, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 04:58:03] (step=0277000) Train Loss: 0.1463, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 04:58:11] (step=0277100) Train Loss: 0.1473, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 04:58:19] (step=0277200) Train Loss: 0.1461, Train Steps/Sec: 12.55, Grad Norm: 0.0061 +[2025-02-21 04:58:27] (step=0277300) Train Loss: 0.1479, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 04:58:35] (step=0277400) Train Loss: 0.1481, Train Steps/Sec: 12.08, Grad Norm: 0.0062 +[2025-02-21 04:58:43] (step=0277500) Train Loss: 0.1469, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 04:58:51] (step=0277600) Train Loss: 0.1484, Train Steps/Sec: 12.54, Grad Norm: 0.0062 +[2025-02-21 04:58:59] (step=0277700) Train Loss: 0.1464, Train Steps/Sec: 12.49, Grad Norm: 0.0063 +[2025-02-21 04:59:09] (step=0277800) Train Loss: 0.1477, Train Steps/Sec: 9.67, Grad Norm: 0.0062 +[2025-02-21 04:59:19] (step=0277900) Train Loss: 0.1484, Train Steps/Sec: 10.62, Grad Norm: 0.0062 +[2025-02-21 04:59:27] (step=0278000) Train Loss: 0.1471, Train Steps/Sec: 11.73, Grad Norm: 0.0064 +[2025-02-21 04:59:36] (step=0278100) Train Loss: 0.1470, Train Steps/Sec: 11.30, Grad Norm: 0.0063 +[2025-02-21 04:59:44] (step=0278200) Train Loss: 0.1472, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 04:59:53] (step=0278300) Train Loss: 0.1460, Train Steps/Sec: 11.78, Grad Norm: 0.0063 +[2025-02-21 05:00:01] (step=0278400) Train Loss: 0.1472, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 05:00:10] (step=0278500) Train Loss: 0.1468, Train Steps/Sec: 10.65, Grad Norm: 0.0063 +[2025-02-21 05:00:20] (step=0278600) Train Loss: 0.1483, Train Steps/Sec: 10.31, Grad Norm: 0.0064 +[2025-02-21 05:00:28] (step=0278700) Train Loss: 0.1472, Train Steps/Sec: 12.11, Grad Norm: 0.0060 +[2025-02-21 05:00:36] (step=0278800) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 05:00:44] (step=0278900) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 05:00:53] (step=0279000) Train Loss: 0.1469, Train Steps/Sec: 11.19, Grad Norm: 0.0061 +[2025-02-21 05:01:01] (step=0279100) Train Loss: 0.1476, Train Steps/Sec: 12.49, Grad Norm: 0.0061 +[2025-02-21 05:01:09] (step=0279200) Train Loss: 0.1477, Train Steps/Sec: 12.51, Grad Norm: 0.0062 +[2025-02-21 05:01:17] (step=0279300) Train Loss: 0.1474, Train Steps/Sec: 12.53, Grad Norm: 0.0059 +[2025-02-21 05:01:25] (step=0279400) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 05:01:33] (step=0279500) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 05:01:41] (step=0279600) Train Loss: 0.1462, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 05:01:49] (step=0279700) Train Loss: 0.1468, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 05:01:57] (step=0279800) Train Loss: 0.1453, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 05:02:05] (step=0279900) Train Loss: 0.1474, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 05:02:14] (step=0280000) Train Loss: 0.1468, Train Steps/Sec: 10.67, Grad Norm: 0.0061 +[2025-02-21 05:02:17] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0280000.pt +[2025-02-21 05:02:26] (step=0280100) Train Loss: 0.1466, Train Steps/Sec: 8.52, Grad Norm: 0.0060 +[2025-02-21 05:02:35] (step=0280200) Train Loss: 0.1475, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-21 05:02:44] (step=0280300) Train Loss: 0.1478, Train Steps/Sec: 10.57, Grad Norm: 0.0059 +[2025-02-21 05:02:53] (step=0280400) Train Loss: 0.1461, Train Steps/Sec: 12.10, Grad Norm: 0.0062 +[2025-02-21 05:03:01] (step=0280500) Train Loss: 0.1474, Train Steps/Sec: 11.78, Grad Norm: 0.0061 +[2025-02-21 05:03:10] (step=0280600) Train Loss: 0.1472, Train Steps/Sec: 10.96, Grad Norm: 0.0060 +[2025-02-21 05:03:18] (step=0280700) Train Loss: 0.1477, Train Steps/Sec: 12.08, Grad Norm: 0.0062 +[2025-02-21 05:03:29] (step=0280800) Train Loss: 0.1475, Train Steps/Sec: 9.96, Grad Norm: 0.0061 +[2025-02-21 05:03:37] (step=0280900) Train Loss: 0.1472, Train Steps/Sec: 12.55, Grad Norm: 0.0060 +[2025-02-21 05:03:44] (step=0281000) Train Loss: 0.1478, Train Steps/Sec: 12.51, Grad Norm: 0.0064 +[2025-02-21 05:03:52] (step=0281100) Train Loss: 0.1463, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 05:04:00] (step=0281200) Train Loss: 0.1476, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 05:04:08] (step=0281300) Train Loss: 0.1457, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 05:04:16] (step=0281400) Train Loss: 0.1464, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 05:04:25] (step=0281500) Train Loss: 0.1477, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-21 05:04:33] (step=0281600) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 05:04:41] (step=0281700) Train Loss: 0.1482, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 05:04:49] (step=0281800) Train Loss: 0.1470, Train Steps/Sec: 12.04, Grad Norm: 0.0060 +[2025-02-21 05:04:57] (step=0281900) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 05:05:05] (step=0282000) Train Loss: 0.1490, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 05:05:13] (step=0282100) Train Loss: 0.1481, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 05:05:23] (step=0282200) Train Loss: 0.1463, Train Steps/Sec: 10.39, Grad Norm: 0.0061 +[2025-02-21 05:05:32] (step=0282300) Train Loss: 0.1464, Train Steps/Sec: 10.98, Grad Norm: 0.0065 +[2025-02-21 05:05:41] (step=0282400) Train Loss: 0.1474, Train Steps/Sec: 11.23, Grad Norm: 0.0063 +[2025-02-21 05:05:49] (step=0282500) Train Loss: 0.1473, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 05:05:58] (step=0282600) Train Loss: 0.1479, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 05:06:07] (step=0282700) Train Loss: 0.1480, Train Steps/Sec: 11.39, Grad Norm: 0.0064 +[2025-02-21 05:06:16] (step=0282800) Train Loss: 0.1472, Train Steps/Sec: 10.23, Grad Norm: 0.0062 +[2025-02-21 05:06:25] (step=0282900) Train Loss: 0.1468, Train Steps/Sec: 11.71, Grad Norm: 0.0062 +[2025-02-21 05:06:35] (step=0283000) Train Loss: 0.1479, Train Steps/Sec: 10.29, Grad Norm: 0.0060 +[2025-02-21 05:06:43] (step=0283100) Train Loss: 0.1478, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 05:06:51] (step=0283200) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 05:06:58] (step=0283300) Train Loss: 0.1483, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 05:07:06] (step=0283400) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 05:07:14] (step=0283500) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 05:07:22] (step=0283600) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 05:07:30] (step=0283700) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 05:07:38] (step=0283800) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 05:07:46] (step=0283900) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 05:07:55] (step=0284000) Train Loss: 0.1472, Train Steps/Sec: 10.85, Grad Norm: 0.0063 +[2025-02-21 05:08:03] (step=0284100) Train Loss: 0.1469, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 05:08:11] (step=0284200) Train Loss: 0.1474, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 05:08:19] (step=0284300) Train Loss: 0.1458, Train Steps/Sec: 12.12, Grad Norm: 0.0062 +[2025-02-21 05:08:29] (step=0284400) Train Loss: 0.1466, Train Steps/Sec: 10.36, Grad Norm: 0.0062 +[2025-02-21 05:08:38] (step=0284500) Train Loss: 0.1469, Train Steps/Sec: 10.99, Grad Norm: 0.0063 +[2025-02-21 05:08:47] (step=0284600) Train Loss: 0.1462, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 05:08:56] (step=0284700) Train Loss: 0.1466, Train Steps/Sec: 11.40, Grad Norm: 0.0060 +[2025-02-21 05:09:04] (step=0284800) Train Loss: 0.1476, Train Steps/Sec: 11.81, Grad Norm: 0.0060 +[2025-02-21 05:09:13] (step=0284900) Train Loss: 0.1466, Train Steps/Sec: 11.30, Grad Norm: 0.0060 +[2025-02-21 05:09:21] (step=0285000) Train Loss: 0.1481, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 05:09:29] (step=0285100) Train Loss: 0.1482, Train Steps/Sec: 12.03, Grad Norm: 0.0062 +[2025-02-21 05:09:39] (step=0285200) Train Loss: 0.1467, Train Steps/Sec: 9.92, Grad Norm: 0.0060 +[2025-02-21 05:09:48] (step=0285300) Train Loss: 0.1462, Train Steps/Sec: 11.21, Grad Norm: 0.0060 +[2025-02-21 05:09:56] (step=0285400) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 05:10:04] (step=0285500) Train Loss: 0.1482, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 05:10:12] (step=0285600) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 05:10:20] (step=0285700) Train Loss: 0.1473, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 05:10:28] (step=0285800) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 05:10:36] (step=0285900) Train Loss: 0.1459, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 05:10:44] (step=0286000) Train Loss: 0.1473, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 05:10:52] (step=0286100) Train Loss: 0.1472, Train Steps/Sec: 12.12, Grad Norm: 0.0063 +[2025-02-21 05:11:00] (step=0286200) Train Loss: 0.1471, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 05:11:08] (step=0286300) Train Loss: 0.1488, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 05:11:16] (step=0286400) Train Loss: 0.1467, Train Steps/Sec: 12.56, Grad Norm: 0.0063 +[2025-02-21 05:11:25] (step=0286500) Train Loss: 0.1465, Train Steps/Sec: 10.85, Grad Norm: 0.0060 +[2025-02-21 05:11:36] (step=0286600) Train Loss: 0.1480, Train Steps/Sec: 9.78, Grad Norm: 0.0060 +[2025-02-21 05:11:44] (step=0286700) Train Loss: 0.1480, Train Steps/Sec: 11.35, Grad Norm: 0.0062 +[2025-02-21 05:11:53] (step=0286800) Train Loss: 0.1485, Train Steps/Sec: 10.96, Grad Norm: 0.0060 +[2025-02-21 05:12:02] (step=0286900) Train Loss: 0.1482, Train Steps/Sec: 12.18, Grad Norm: 0.0062 +[2025-02-21 05:12:11] (step=0287000) Train Loss: 0.1470, Train Steps/Sec: 11.04, Grad Norm: 0.0061 +[2025-02-21 05:12:19] (step=0287100) Train Loss: 0.1488, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 05:12:27] (step=0287200) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 05:12:35] (step=0287300) Train Loss: 0.1484, Train Steps/Sec: 11.68, Grad Norm: 0.0061 +[2025-02-21 05:12:45] (step=0287400) Train Loss: 0.1459, Train Steps/Sec: 10.30, Grad Norm: 0.0062 +[2025-02-21 05:12:53] (step=0287500) Train Loss: 0.1462, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 05:13:01] (step=0287600) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 05:13:09] (step=0287700) Train Loss: 0.1466, Train Steps/Sec: 12.54, Grad Norm: 0.0059 +[2025-02-21 05:13:18] (step=0287800) Train Loss: 0.1475, Train Steps/Sec: 11.22, Grad Norm: 0.0060 +[2025-02-21 05:13:26] (step=0287900) Train Loss: 0.1461, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 05:13:34] (step=0288000) Train Loss: 0.1484, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 05:13:42] (step=0288100) Train Loss: 0.1472, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 05:13:50] (step=0288200) Train Loss: 0.1459, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 05:13:58] (step=0288300) Train Loss: 0.1485, Train Steps/Sec: 12.11, Grad Norm: 0.0061 +[2025-02-21 05:14:06] (step=0288400) Train Loss: 0.1475, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 05:14:14] (step=0288500) Train Loss: 0.1471, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 05:14:22] (step=0288600) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 05:14:30] (step=0288700) Train Loss: 0.1467, Train Steps/Sec: 11.71, Grad Norm: 0.0060 +[2025-02-21 05:14:40] (step=0288800) Train Loss: 0.1467, Train Steps/Sec: 10.34, Grad Norm: 0.0059 +[2025-02-21 05:14:49] (step=0288900) Train Loss: 0.1467, Train Steps/Sec: 11.37, Grad Norm: 0.0063 +[2025-02-21 05:14:59] (step=0289000) Train Loss: 0.1474, Train Steps/Sec: 9.68, Grad Norm: 0.0061 +[2025-02-21 05:15:08] (step=0289100) Train Loss: 0.1474, Train Steps/Sec: 11.38, Grad Norm: 0.0061 +[2025-02-21 05:15:17] (step=0289200) Train Loss: 0.1463, Train Steps/Sec: 11.70, Grad Norm: 0.0062 +[2025-02-21 05:15:24] (step=0289300) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 05:15:32] (step=0289400) Train Loss: 0.1482, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 05:15:41] (step=0289500) Train Loss: 0.1464, Train Steps/Sec: 10.98, Grad Norm: 0.0059 +[2025-02-21 05:15:51] (step=0289600) Train Loss: 0.1467, Train Steps/Sec: 10.90, Grad Norm: 0.0059 +[2025-02-21 05:15:59] (step=0289700) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 05:16:07] (step=0289800) Train Loss: 0.1484, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 05:16:14] (step=0289900) Train Loss: 0.1473, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 05:16:22] (step=0290000) Train Loss: 0.1455, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 05:16:30] (step=0290100) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 05:16:38] (step=0290200) Train Loss: 0.1471, Train Steps/Sec: 12.51, Grad Norm: 0.0061 +[2025-02-21 05:16:47] (step=0290300) Train Loss: 0.1463, Train Steps/Sec: 11.20, Grad Norm: 0.0059 +[2025-02-21 05:16:55] (step=0290400) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 05:17:03] (step=0290500) Train Loss: 0.1481, Train Steps/Sec: 12.13, Grad Norm: 0.0064 +[2025-02-21 05:17:11] (step=0290600) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 05:17:19] (step=0290700) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 05:17:27] (step=0290800) Train Loss: 0.1472, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 05:17:36] (step=0290900) Train Loss: 0.1474, Train Steps/Sec: 11.39, Grad Norm: 0.0061 +[2025-02-21 05:17:45] (step=0291000) Train Loss: 0.1462, Train Steps/Sec: 10.73, Grad Norm: 0.0059 +[2025-02-21 05:17:54] (step=0291100) Train Loss: 0.1464, Train Steps/Sec: 11.00, Grad Norm: 0.0060 +[2025-02-21 05:18:04] (step=0291200) Train Loss: 0.1481, Train Steps/Sec: 10.35, Grad Norm: 0.0062 +[2025-02-21 05:18:13] (step=0291300) Train Loss: 0.1471, Train Steps/Sec: 11.75, Grad Norm: 0.0060 +[2025-02-21 05:18:21] (step=0291400) Train Loss: 0.1460, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 05:18:30] (step=0291500) Train Loss: 0.1487, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-21 05:18:38] (step=0291600) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 05:18:47] (step=0291700) Train Loss: 0.1474, Train Steps/Sec: 10.62, Grad Norm: 0.0060 +[2025-02-21 05:18:56] (step=0291800) Train Loss: 0.1466, Train Steps/Sec: 11.31, Grad Norm: 0.0059 +[2025-02-21 05:19:04] (step=0291900) Train Loss: 0.1492, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 05:19:12] (step=0292000) Train Loss: 0.1474, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 05:19:20] (step=0292100) Train Loss: 0.1450, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 05:19:28] (step=0292200) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 05:19:35] (step=0292300) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0068 +[2025-02-21 05:19:43] (step=0292400) Train Loss: 0.1468, Train Steps/Sec: 12.54, Grad Norm: 0.0062 +[2025-02-21 05:19:51] (step=0292500) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 05:19:59] (step=0292600) Train Loss: 0.1482, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 05:20:08] (step=0292700) Train Loss: 0.1468, Train Steps/Sec: 12.11, Grad Norm: 0.0062 +[2025-02-21 05:20:16] (step=0292800) Train Loss: 0.1463, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-21 05:20:24] (step=0292900) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 05:20:32] (step=0293000) Train Loss: 0.1464, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 05:20:42] (step=0293100) Train Loss: 0.1475, Train Steps/Sec: 10.68, Grad Norm: 0.0061 +[2025-02-21 05:20:51] (step=0293200) Train Loss: 0.1459, Train Steps/Sec: 11.01, Grad Norm: 0.0060 +[2025-02-21 05:21:00] (step=0293300) Train Loss: 0.1462, Train Steps/Sec: 10.69, Grad Norm: 0.0060 +[2025-02-21 05:21:10] (step=0293400) Train Loss: 0.1465, Train Steps/Sec: 10.39, Grad Norm: 0.0060 +[2025-02-21 05:21:18] (step=0293500) Train Loss: 0.1481, Train Steps/Sec: 12.18, Grad Norm: 0.0063 +[2025-02-21 05:21:26] (step=0293600) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 05:21:34] (step=0293700) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 05:21:42] (step=0293800) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 05:21:51] (step=0293900) Train Loss: 0.1472, Train Steps/Sec: 11.00, Grad Norm: 0.0065 +[2025-02-21 05:22:01] (step=0294000) Train Loss: 0.1467, Train Steps/Sec: 9.87, Grad Norm: 0.0061 +[2025-02-21 05:22:09] (step=0294100) Train Loss: 0.1467, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 05:22:17] (step=0294200) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 05:22:25] (step=0294300) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 05:22:33] (step=0294400) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 05:22:41] (step=0294500) Train Loss: 0.1481, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 05:22:49] (step=0294600) Train Loss: 0.1489, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 05:22:57] (step=0294700) Train Loss: 0.1473, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 05:23:05] (step=0294800) Train Loss: 0.1464, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 05:23:13] (step=0294900) Train Loss: 0.1473, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 05:23:21] (step=0295000) Train Loss: 0.1488, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 05:23:29] (step=0295100) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 05:23:36] (step=0295200) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 05:23:47] (step=0295300) Train Loss: 0.1468, Train Steps/Sec: 9.11, Grad Norm: 0.0058 +[2025-02-21 05:23:57] (step=0295400) Train Loss: 0.1469, Train Steps/Sec: 10.66, Grad Norm: 0.0059 +[2025-02-21 05:24:07] (step=0295500) Train Loss: 0.1477, Train Steps/Sec: 10.36, Grad Norm: 0.0060 +[2025-02-21 05:24:15] (step=0295600) Train Loss: 0.1484, Train Steps/Sec: 11.36, Grad Norm: 0.0061 +[2025-02-21 05:24:23] (step=0295700) Train Loss: 0.1476, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 05:24:31] (step=0295800) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 05:24:39] (step=0295900) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 05:24:47] (step=0296000) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 05:24:56] (step=0296100) Train Loss: 0.1467, Train Steps/Sec: 10.69, Grad Norm: 0.0063 +[2025-02-21 05:25:05] (step=0296200) Train Loss: 0.1474, Train Steps/Sec: 11.31, Grad Norm: 0.0063 +[2025-02-21 05:25:13] (step=0296300) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 05:25:21] (step=0296400) Train Loss: 0.1475, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 05:25:30] (step=0296500) Train Loss: 0.1481, Train Steps/Sec: 11.21, Grad Norm: 0.0064 +[2025-02-21 05:25:38] (step=0296600) Train Loss: 0.1452, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 05:25:46] (step=0296700) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 05:25:54] (step=0296800) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 05:26:02] (step=0296900) Train Loss: 0.1472, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 05:26:10] (step=0297000) Train Loss: 0.1463, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 05:26:18] (step=0297100) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 05:26:26] (step=0297200) Train Loss: 0.1472, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 05:26:34] (step=0297300) Train Loss: 0.1466, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 05:26:42] (step=0297400) Train Loss: 0.1470, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 05:26:52] (step=0297500) Train Loss: 0.1470, Train Steps/Sec: 10.35, Grad Norm: 0.0061 +[2025-02-21 05:27:01] (step=0297600) Train Loss: 0.1472, Train Steps/Sec: 10.40, Grad Norm: 0.0062 +[2025-02-21 05:27:11] (step=0297700) Train Loss: 0.1470, Train Steps/Sec: 10.35, Grad Norm: 0.0061 +[2025-02-21 05:27:20] (step=0297800) Train Loss: 0.1470, Train Steps/Sec: 10.61, Grad Norm: 0.0060 +[2025-02-21 05:27:28] (step=0297900) Train Loss: 0.1476, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 05:27:36] (step=0298000) Train Loss: 0.1475, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 05:27:44] (step=0298100) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 05:27:52] (step=0298200) Train Loss: 0.1482, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 05:28:01] (step=0298300) Train Loss: 0.1474, Train Steps/Sec: 10.63, Grad Norm: 0.0061 +[2025-02-21 05:28:10] (step=0298400) Train Loss: 0.1471, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-21 05:28:18] (step=0298500) Train Loss: 0.1483, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 05:28:26] (step=0298600) Train Loss: 0.1478, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 05:28:34] (step=0298700) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 05:28:42] (step=0298800) Train Loss: 0.1480, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 05:28:50] (step=0298900) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 05:28:59] (step=0299000) Train Loss: 0.1462, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-21 05:29:07] (step=0299100) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 05:29:15] (step=0299200) Train Loss: 0.1480, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 05:29:23] (step=0299300) Train Loss: 0.1482, Train Steps/Sec: 12.64, Grad Norm: 0.0064 +[2025-02-21 05:29:31] (step=0299400) Train Loss: 0.1475, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 05:29:39] (step=0299500) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 05:29:47] (step=0299600) Train Loss: 0.1475, Train Steps/Sec: 11.77, Grad Norm: 0.0061 +[2025-02-21 05:29:58] (step=0299700) Train Loss: 0.1483, Train Steps/Sec: 9.59, Grad Norm: 0.0059 +[2025-02-21 05:30:08] (step=0299800) Train Loss: 0.1466, Train Steps/Sec: 10.13, Grad Norm: 0.0061 +[2025-02-21 05:30:17] (step=0299900) Train Loss: 0.1467, Train Steps/Sec: 10.98, Grad Norm: 0.0064 +[2025-02-21 05:30:25] (step=0300000) Train Loss: 0.1478, Train Steps/Sec: 12.19, Grad Norm: 0.0063 +[2025-02-21 05:30:29] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0300000.pt +[2025-02-21 05:30:37] (step=0300100) Train Loss: 0.1467, Train Steps/Sec: 8.46, Grad Norm: 0.0060 +[2025-02-21 05:30:45] (step=0300200) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 05:30:54] (step=0300300) Train Loss: 0.1472, Train Steps/Sec: 11.20, Grad Norm: 0.0061 +[2025-02-21 05:31:02] (step=0300400) Train Loss: 0.1480, Train Steps/Sec: 12.10, Grad Norm: 0.0061 +[2025-02-21 05:31:11] (step=0300500) Train Loss: 0.1470, Train Steps/Sec: 11.01, Grad Norm: 0.0061 +[2025-02-21 05:31:20] (step=0300600) Train Loss: 0.1471, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-21 05:31:28] (step=0300700) Train Loss: 0.1479, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 05:31:36] (step=0300800) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 05:31:44] (step=0300900) Train Loss: 0.1476, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 05:31:52] (step=0301000) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 05:31:59] (step=0301100) Train Loss: 0.1468, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 05:32:07] (step=0301200) Train Loss: 0.1465, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 05:32:15] (step=0301300) Train Loss: 0.1465, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 05:32:24] (step=0301400) Train Loss: 0.1480, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 05:32:32] (step=0301500) Train Loss: 0.1476, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-21 05:32:40] (step=0301600) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 05:32:49] (step=0301700) Train Loss: 0.1456, Train Steps/Sec: 12.16, Grad Norm: 0.0063 +[2025-02-21 05:32:57] (step=0301800) Train Loss: 0.1466, Train Steps/Sec: 11.76, Grad Norm: 0.0059 +[2025-02-21 05:33:08] (step=0301900) Train Loss: 0.1467, Train Steps/Sec: 9.07, Grad Norm: 0.0062 +[2025-02-21 05:33:17] (step=0302000) Train Loss: 0.1458, Train Steps/Sec: 11.41, Grad Norm: 0.0062 +[2025-02-21 05:33:26] (step=0302100) Train Loss: 0.1477, Train Steps/Sec: 10.98, Grad Norm: 0.0062 +[2025-02-21 05:33:34] (step=0302200) Train Loss: 0.1484, Train Steps/Sec: 12.18, Grad Norm: 0.0062 +[2025-02-21 05:33:42] (step=0302300) Train Loss: 0.1477, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 05:33:50] (step=0302400) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 05:33:58] (step=0302500) Train Loss: 0.1471, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 05:34:06] (step=0302600) Train Loss: 0.1469, Train Steps/Sec: 12.13, Grad Norm: 0.0061 +[2025-02-21 05:34:16] (step=0302700) Train Loss: 0.1487, Train Steps/Sec: 10.81, Grad Norm: 0.0063 +[2025-02-21 05:34:25] (step=0302800) Train Loss: 0.1479, Train Steps/Sec: 10.10, Grad Norm: 0.0059 +[2025-02-21 05:34:33] (step=0302900) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 05:34:41] (step=0303000) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 05:34:49] (step=0303100) Train Loss: 0.1484, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 05:34:57] (step=0303200) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 05:35:05] (step=0303300) Train Loss: 0.1477, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 05:35:13] (step=0303400) Train Loss: 0.1475, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 05:35:21] (step=0303500) Train Loss: 0.1480, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 05:35:29] (step=0303600) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 05:35:37] (step=0303700) Train Loss: 0.1478, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 05:35:45] (step=0303800) Train Loss: 0.1482, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 05:35:53] (step=0303900) Train Loss: 0.1454, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 05:36:04] (step=0304000) Train Loss: 0.1471, Train Steps/Sec: 9.58, Grad Norm: 0.0062 +[2025-02-21 05:36:14] (step=0304100) Train Loss: 0.1471, Train Steps/Sec: 9.40, Grad Norm: 0.0060 +[2025-02-21 05:36:23] (step=0304200) Train Loss: 0.1467, Train Steps/Sec: 11.31, Grad Norm: 0.0061 +[2025-02-21 05:36:32] (step=0304300) Train Loss: 0.1486, Train Steps/Sec: 11.33, Grad Norm: 0.0061 +[2025-02-21 05:36:40] (step=0304400) Train Loss: 0.1469, Train Steps/Sec: 12.18, Grad Norm: 0.0063 +[2025-02-21 05:36:48] (step=0304500) Train Loss: 0.1484, Train Steps/Sec: 12.60, Grad Norm: 0.0066 +[2025-02-21 05:36:56] (step=0304600) Train Loss: 0.1457, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 05:37:04] (step=0304700) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 05:37:12] (step=0304800) Train Loss: 0.1464, Train Steps/Sec: 12.14, Grad Norm: 0.0064 +[2025-02-21 05:37:22] (step=0304900) Train Loss: 0.1468, Train Steps/Sec: 10.60, Grad Norm: 0.0059 +[2025-02-21 05:37:30] (step=0305000) Train Loss: 0.1469, Train Steps/Sec: 11.70, Grad Norm: 0.0058 +[2025-02-21 05:37:38] (step=0305100) Train Loss: 0.1467, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 05:37:46] (step=0305200) Train Loss: 0.1474, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 05:37:55] (step=0305300) Train Loss: 0.1469, Train Steps/Sec: 11.08, Grad Norm: 0.0061 +[2025-02-21 05:38:03] (step=0305400) Train Loss: 0.1482, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 05:38:11] (step=0305500) Train Loss: 0.1483, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 05:38:19] (step=0305600) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 05:38:27] (step=0305700) Train Loss: 0.1479, Train Steps/Sec: 12.12, Grad Norm: 0.0062 +[2025-02-21 05:38:35] (step=0305800) Train Loss: 0.1464, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 05:38:43] (step=0305900) Train Loss: 0.1463, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 05:38:52] (step=0306000) Train Loss: 0.1479, Train Steps/Sec: 12.15, Grad Norm: 0.0063 +[2025-02-21 05:39:00] (step=0306100) Train Loss: 0.1457, Train Steps/Sec: 11.81, Grad Norm: 0.0062 +[2025-02-21 05:39:10] (step=0306200) Train Loss: 0.1467, Train Steps/Sec: 10.01, Grad Norm: 0.0060 +[2025-02-21 05:39:20] (step=0306300) Train Loss: 0.1484, Train Steps/Sec: 10.03, Grad Norm: 0.0059 +[2025-02-21 05:39:28] (step=0306400) Train Loss: 0.1478, Train Steps/Sec: 11.75, Grad Norm: 0.0061 +[2025-02-21 05:39:39] (step=0306500) Train Loss: 0.1464, Train Steps/Sec: 9.89, Grad Norm: 0.0060 +[2025-02-21 05:39:47] (step=0306600) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 05:39:54] (step=0306700) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 05:40:02] (step=0306800) Train Loss: 0.1457, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 05:40:10] (step=0306900) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 05:40:19] (step=0307000) Train Loss: 0.1465, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 05:40:28] (step=0307100) Train Loss: 0.1470, Train Steps/Sec: 10.27, Grad Norm: 0.0060 +[2025-02-21 05:40:37] (step=0307200) Train Loss: 0.1465, Train Steps/Sec: 12.11, Grad Norm: 0.0061 +[2025-02-21 05:40:45] (step=0307300) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 05:40:52] (step=0307400) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 05:41:00] (step=0307500) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 05:41:08] (step=0307600) Train Loss: 0.1475, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 05:41:16] (step=0307700) Train Loss: 0.1473, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 05:41:25] (step=0307800) Train Loss: 0.1463, Train Steps/Sec: 11.10, Grad Norm: 0.0060 +[2025-02-21 05:41:34] (step=0307900) Train Loss: 0.1465, Train Steps/Sec: 12.11, Grad Norm: 0.0060 +[2025-02-21 05:41:42] (step=0308000) Train Loss: 0.1469, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 05:41:50] (step=0308100) Train Loss: 0.1465, Train Steps/Sec: 12.16, Grad Norm: 0.0062 +[2025-02-21 05:41:58] (step=0308200) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 05:42:07] (step=0308300) Train Loss: 0.1467, Train Steps/Sec: 11.08, Grad Norm: 0.0062 +[2025-02-21 05:42:16] (step=0308400) Train Loss: 0.1474, Train Steps/Sec: 10.37, Grad Norm: 0.0061 +[2025-02-21 05:42:26] (step=0308500) Train Loss: 0.1456, Train Steps/Sec: 10.39, Grad Norm: 0.0060 +[2025-02-21 05:42:35] (step=0308600) Train Loss: 0.1474, Train Steps/Sec: 11.03, Grad Norm: 0.0060 +[2025-02-21 05:42:44] (step=0308700) Train Loss: 0.1474, Train Steps/Sec: 11.75, Grad Norm: 0.0060 +[2025-02-21 05:42:51] (step=0308800) Train Loss: 0.1472, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 05:42:59] (step=0308900) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 05:43:08] (step=0309000) Train Loss: 0.1461, Train Steps/Sec: 11.18, Grad Norm: 0.0062 +[2025-02-21 05:43:16] (step=0309100) Train Loss: 0.1477, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 05:43:24] (step=0309200) Train Loss: 0.1462, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 05:43:34] (step=0309300) Train Loss: 0.1463, Train Steps/Sec: 10.28, Grad Norm: 0.0058 +[2025-02-21 05:43:42] (step=0309400) Train Loss: 0.1468, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 05:43:50] (step=0309500) Train Loss: 0.1464, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 05:43:58] (step=0309600) Train Loss: 0.1481, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 05:44:06] (step=0309700) Train Loss: 0.1468, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 05:44:14] (step=0309800) Train Loss: 0.1472, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 05:44:22] (step=0309900) Train Loss: 0.1481, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 05:44:30] (step=0310000) Train Loss: 0.1475, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 05:44:38] (step=0310100) Train Loss: 0.1469, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 05:44:46] (step=0310200) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 05:44:55] (step=0310300) Train Loss: 0.1462, Train Steps/Sec: 10.84, Grad Norm: 0.0058 +[2025-02-21 05:45:04] (step=0310400) Train Loss: 0.1481, Train Steps/Sec: 11.78, Grad Norm: 0.0059 +[2025-02-21 05:45:13] (step=0310500) Train Loss: 0.1486, Train Steps/Sec: 11.03, Grad Norm: 0.0060 +[2025-02-21 05:45:23] (step=0310600) Train Loss: 0.1475, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-21 05:45:32] (step=0310700) Train Loss: 0.1464, Train Steps/Sec: 10.67, Grad Norm: 0.0058 +[2025-02-21 05:45:41] (step=0310800) Train Loss: 0.1480, Train Steps/Sec: 11.35, Grad Norm: 0.0059 +[2025-02-21 05:45:49] (step=0310900) Train Loss: 0.1480, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 05:45:57] (step=0311000) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 05:46:05] (step=0311100) Train Loss: 0.1468, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 05:46:13] (step=0311200) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 05:46:21] (step=0311300) Train Loss: 0.1451, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 05:46:30] (step=0311400) Train Loss: 0.1467, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 05:46:40] (step=0311500) Train Loss: 0.1464, Train Steps/Sec: 9.63, Grad Norm: 0.0060 +[2025-02-21 05:46:48] (step=0311600) Train Loss: 0.1471, Train Steps/Sec: 12.11, Grad Norm: 0.0060 +[2025-02-21 05:46:56] (step=0311700) Train Loss: 0.1461, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 05:47:04] (step=0311800) Train Loss: 0.1467, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 05:47:12] (step=0311900) Train Loss: 0.1480, Train Steps/Sec: 12.65, Grad Norm: 0.0059 +[2025-02-21 05:47:20] (step=0312000) Train Loss: 0.1464, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 05:47:28] (step=0312100) Train Loss: 0.1477, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 05:47:36] (step=0312200) Train Loss: 0.1471, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 05:47:44] (step=0312300) Train Loss: 0.1481, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 05:47:52] (step=0312400) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 05:48:01] (step=0312500) Train Loss: 0.1467, Train Steps/Sec: 11.41, Grad Norm: 0.0060 +[2025-02-21 05:48:09] (step=0312600) Train Loss: 0.1458, Train Steps/Sec: 11.40, Grad Norm: 0.0062 +[2025-02-21 05:48:18] (step=0312700) Train Loss: 0.1466, Train Steps/Sec: 11.79, Grad Norm: 0.0059 +[2025-02-21 05:48:29] (step=0312800) Train Loss: 0.1471, Train Steps/Sec: 8.89, Grad Norm: 0.0061 +[2025-02-21 05:48:38] (step=0312900) Train Loss: 0.1468, Train Steps/Sec: 11.35, Grad Norm: 0.0059 +[2025-02-21 05:48:47] (step=0313000) Train Loss: 0.1463, Train Steps/Sec: 11.32, Grad Norm: 0.0061 +[2025-02-21 05:48:55] (step=0313100) Train Loss: 0.1453, Train Steps/Sec: 12.21, Grad Norm: 0.0061 +[2025-02-21 05:49:03] (step=0313200) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 05:49:11] (step=0313300) Train Loss: 0.1480, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 05:49:19] (step=0313400) Train Loss: 0.1461, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 05:49:27] (step=0313500) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 05:49:35] (step=0313600) Train Loss: 0.1479, Train Steps/Sec: 11.36, Grad Norm: 0.0062 +[2025-02-21 05:49:45] (step=0313700) Train Loss: 0.1477, Train Steps/Sec: 10.97, Grad Norm: 0.0062 +[2025-02-21 05:49:53] (step=0313800) Train Loss: 0.1471, Train Steps/Sec: 12.10, Grad Norm: 0.0063 +[2025-02-21 05:50:01] (step=0313900) Train Loss: 0.1480, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 05:50:09] (step=0314000) Train Loss: 0.1473, Train Steps/Sec: 12.55, Grad Norm: 0.0060 +[2025-02-21 05:50:18] (step=0314100) Train Loss: 0.1470, Train Steps/Sec: 11.28, Grad Norm: 0.0059 +[2025-02-21 05:50:26] (step=0314200) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 05:50:34] (step=0314300) Train Loss: 0.1472, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 05:50:41] (step=0314400) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 05:50:50] (step=0314500) Train Loss: 0.1475, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 05:50:58] (step=0314600) Train Loss: 0.1479, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 05:51:07] (step=0314700) Train Loss: 0.1480, Train Steps/Sec: 11.44, Grad Norm: 0.0064 +[2025-02-21 05:51:15] (step=0314800) Train Loss: 0.1479, Train Steps/Sec: 11.43, Grad Norm: 0.0060 +[2025-02-21 05:51:24] (step=0314900) Train Loss: 0.1457, Train Steps/Sec: 11.73, Grad Norm: 0.0059 +[2025-02-21 05:51:34] (step=0315000) Train Loss: 0.1475, Train Steps/Sec: 10.07, Grad Norm: 0.0063 +[2025-02-21 05:51:43] (step=0315100) Train Loss: 0.1471, Train Steps/Sec: 11.38, Grad Norm: 0.0061 +[2025-02-21 05:51:51] (step=0315200) Train Loss: 0.1471, Train Steps/Sec: 11.33, Grad Norm: 0.0061 +[2025-02-21 05:52:01] (step=0315300) Train Loss: 0.1474, Train Steps/Sec: 10.89, Grad Norm: 0.0064 +[2025-02-21 05:52:09] (step=0315400) Train Loss: 0.1478, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 05:52:17] (step=0315500) Train Loss: 0.1461, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 05:52:24] (step=0315600) Train Loss: 0.1455, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 05:52:32] (step=0315700) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 05:52:41] (step=0315800) Train Loss: 0.1469, Train Steps/Sec: 11.36, Grad Norm: 0.0062 +[2025-02-21 05:52:51] (step=0315900) Train Loss: 0.1472, Train Steps/Sec: 10.60, Grad Norm: 0.0062 +[2025-02-21 05:52:59] (step=0316000) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 05:53:07] (step=0316100) Train Loss: 0.1478, Train Steps/Sec: 12.62, Grad Norm: 0.0066 +[2025-02-21 05:53:14] (step=0316200) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 05:53:22] (step=0316300) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 05:53:30] (step=0316400) Train Loss: 0.1472, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 05:53:38] (step=0316500) Train Loss: 0.1480, Train Steps/Sec: 12.55, Grad Norm: 0.0064 +[2025-02-21 05:53:47] (step=0316600) Train Loss: 0.1483, Train Steps/Sec: 11.26, Grad Norm: 0.0060 +[2025-02-21 05:53:56] (step=0316700) Train Loss: 0.1466, Train Steps/Sec: 11.67, Grad Norm: 0.0059 +[2025-02-21 05:54:04] (step=0316800) Train Loss: 0.1465, Train Steps/Sec: 11.75, Grad Norm: 0.0061 +[2025-02-21 05:54:13] (step=0316900) Train Loss: 0.1463, Train Steps/Sec: 11.02, Grad Norm: 0.0059 +[2025-02-21 05:54:22] (step=0317000) Train Loss: 0.1477, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 05:54:30] (step=0317100) Train Loss: 0.1476, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 05:54:40] (step=0317200) Train Loss: 0.1476, Train Steps/Sec: 10.06, Grad Norm: 0.0059 +[2025-02-21 05:54:49] (step=0317300) Train Loss: 0.1477, Train Steps/Sec: 11.29, Grad Norm: 0.0059 +[2025-02-21 05:54:57] (step=0317400) Train Loss: 0.1462, Train Steps/Sec: 11.68, Grad Norm: 0.0062 +[2025-02-21 05:55:05] (step=0317500) Train Loss: 0.1467, Train Steps/Sec: 12.16, Grad Norm: 0.0063 +[2025-02-21 05:55:13] (step=0317600) Train Loss: 0.1479, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 05:55:21] (step=0317700) Train Loss: 0.1473, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 05:55:30] (step=0317800) Train Loss: 0.1464, Train Steps/Sec: 11.19, Grad Norm: 0.0062 +[2025-02-21 05:55:38] (step=0317900) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 05:55:47] (step=0318000) Train Loss: 0.1456, Train Steps/Sec: 10.96, Grad Norm: 0.0061 +[2025-02-21 05:55:56] (step=0318100) Train Loss: 0.1471, Train Steps/Sec: 10.96, Grad Norm: 0.0060 +[2025-02-21 05:56:04] (step=0318200) Train Loss: 0.1464, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 05:56:12] (step=0318300) Train Loss: 0.1471, Train Steps/Sec: 12.64, Grad Norm: 0.0066 +[2025-02-21 05:56:20] (step=0318400) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 05:56:28] (step=0318500) Train Loss: 0.1478, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 05:56:36] (step=0318600) Train Loss: 0.1471, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 05:56:44] (step=0318700) Train Loss: 0.1456, Train Steps/Sec: 12.66, Grad Norm: 0.0061 +[2025-02-21 05:56:52] (step=0318800) Train Loss: 0.1480, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 05:57:01] (step=0318900) Train Loss: 0.1462, Train Steps/Sec: 11.36, Grad Norm: 0.0059 +[2025-02-21 05:57:09] (step=0319000) Train Loss: 0.1465, Train Steps/Sec: 11.34, Grad Norm: 0.0059 +[2025-02-21 05:57:19] (step=0319100) Train Loss: 0.1466, Train Steps/Sec: 10.27, Grad Norm: 0.0061 +[2025-02-21 05:57:27] (step=0319200) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 05:57:35] (step=0319300) Train Loss: 0.1475, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 05:57:45] (step=0319400) Train Loss: 0.1474, Train Steps/Sec: 10.05, Grad Norm: 0.0061 +[2025-02-21 05:57:54] (step=0319500) Train Loss: 0.1476, Train Steps/Sec: 11.34, Grad Norm: 0.0061 +[2025-02-21 05:58:03] (step=0319600) Train Loss: 0.1472, Train Steps/Sec: 11.34, Grad Norm: 0.0061 +[2025-02-21 05:58:11] (step=0319700) Train Loss: 0.1483, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 05:58:19] (step=0319800) Train Loss: 0.1475, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 05:58:27] (step=0319900) Train Loss: 0.1473, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 05:58:35] (step=0320000) Train Loss: 0.1471, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 05:58:39] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0320000.pt +[2025-02-21 05:58:47] (step=0320100) Train Loss: 0.1475, Train Steps/Sec: 7.90, Grad Norm: 0.0061 +[2025-02-21 05:58:56] (step=0320200) Train Loss: 0.1469, Train Steps/Sec: 10.98, Grad Norm: 0.0061 +[2025-02-21 05:59:07] (step=0320300) Train Loss: 0.1473, Train Steps/Sec: 9.87, Grad Norm: 0.0062 +[2025-02-21 05:59:15] (step=0320400) Train Loss: 0.1463, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 05:59:22] (step=0320500) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0066 +[2025-02-21 05:59:30] (step=0320600) Train Loss: 0.1468, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 05:59:38] (step=0320700) Train Loss: 0.1473, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 05:59:46] (step=0320800) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 05:59:54] (step=0320900) Train Loss: 0.1471, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 06:00:03] (step=0321000) Train Loss: 0.1476, Train Steps/Sec: 11.39, Grad Norm: 0.0063 +[2025-02-21 06:00:11] (step=0321100) Train Loss: 0.1488, Train Steps/Sec: 12.23, Grad Norm: 0.0063 +[2025-02-21 06:00:20] (step=0321200) Train Loss: 0.1469, Train Steps/Sec: 10.93, Grad Norm: 0.0059 +[2025-02-21 06:00:29] (step=0321300) Train Loss: 0.1459, Train Steps/Sec: 12.11, Grad Norm: 0.0061 +[2025-02-21 06:00:37] (step=0321400) Train Loss: 0.1465, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 06:00:45] (step=0321500) Train Loss: 0.1482, Train Steps/Sec: 12.05, Grad Norm: 0.0058 +[2025-02-21 06:00:56] (step=0321600) Train Loss: 0.1466, Train Steps/Sec: 8.90, Grad Norm: 0.0060 +[2025-02-21 06:01:05] (step=0321700) Train Loss: 0.1472, Train Steps/Sec: 10.96, Grad Norm: 0.0060 +[2025-02-21 06:01:13] (step=0321800) Train Loss: 0.1465, Train Steps/Sec: 12.21, Grad Norm: 0.0062 +[2025-02-21 06:01:21] (step=0321900) Train Loss: 0.1452, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 06:01:29] (step=0322000) Train Loss: 0.1478, Train Steps/Sec: 12.65, Grad Norm: 0.0059 +[2025-02-21 06:01:37] (step=0322100) Train Loss: 0.1482, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 06:01:45] (step=0322200) Train Loss: 0.1470, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 06:01:53] (step=0322300) Train Loss: 0.1466, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 06:02:03] (step=0322400) Train Loss: 0.1478, Train Steps/Sec: 10.34, Grad Norm: 0.0061 +[2025-02-21 06:02:11] (step=0322500) Train Loss: 0.1457, Train Steps/Sec: 11.68, Grad Norm: 0.0063 +[2025-02-21 06:02:19] (step=0322600) Train Loss: 0.1480, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 06:02:27] (step=0322700) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 06:02:36] (step=0322800) Train Loss: 0.1473, Train Steps/Sec: 11.24, Grad Norm: 0.0059 +[2025-02-21 06:02:44] (step=0322900) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 06:02:52] (step=0323000) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 06:03:00] (step=0323100) Train Loss: 0.1477, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 06:03:09] (step=0323200) Train Loss: 0.1470, Train Steps/Sec: 11.36, Grad Norm: 0.0058 +[2025-02-21 06:03:18] (step=0323300) Train Loss: 0.1461, Train Steps/Sec: 11.03, Grad Norm: 0.0061 +[2025-02-21 06:03:26] (step=0323400) Train Loss: 0.1475, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 06:03:34] (step=0323500) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 06:03:42] (step=0323600) Train Loss: 0.1465, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 06:03:51] (step=0323700) Train Loss: 0.1492, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 06:04:01] (step=0323800) Train Loss: 0.1463, Train Steps/Sec: 10.02, Grad Norm: 0.0058 +[2025-02-21 06:04:10] (step=0323900) Train Loss: 0.1466, Train Steps/Sec: 11.28, Grad Norm: 0.0059 +[2025-02-21 06:04:18] (step=0324000) Train Loss: 0.1467, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 06:04:27] (step=0324100) Train Loss: 0.1479, Train Steps/Sec: 11.22, Grad Norm: 0.0062 +[2025-02-21 06:04:35] (step=0324200) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 06:04:43] (step=0324300) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 06:04:51] (step=0324400) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 06:04:59] (step=0324500) Train Loss: 0.1468, Train Steps/Sec: 12.18, Grad Norm: 0.0062 +[2025-02-21 06:05:09] (step=0324600) Train Loss: 0.1453, Train Steps/Sec: 10.63, Grad Norm: 0.0061 +[2025-02-21 06:05:17] (step=0324700) Train Loss: 0.1465, Train Steps/Sec: 11.71, Grad Norm: 0.0063 +[2025-02-21 06:05:25] (step=0324800) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 06:05:33] (step=0324900) Train Loss: 0.1464, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 06:05:41] (step=0325000) Train Loss: 0.1455, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 06:05:49] (step=0325100) Train Loss: 0.1462, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 06:05:57] (step=0325200) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 06:06:06] (step=0325300) Train Loss: 0.1473, Train Steps/Sec: 10.63, Grad Norm: 0.0062 +[2025-02-21 06:06:15] (step=0325400) Train Loss: 0.1474, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 06:06:24] (step=0325500) Train Loss: 0.1465, Train Steps/Sec: 11.04, Grad Norm: 0.0062 +[2025-02-21 06:06:32] (step=0325600) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 06:06:40] (step=0325700) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 06:06:48] (step=0325800) Train Loss: 0.1470, Train Steps/Sec: 12.16, Grad Norm: 0.0063 +[2025-02-21 06:06:58] (step=0325900) Train Loss: 0.1479, Train Steps/Sec: 10.44, Grad Norm: 0.0061 +[2025-02-21 06:07:07] (step=0326000) Train Loss: 0.1465, Train Steps/Sec: 11.03, Grad Norm: 0.0062 +[2025-02-21 06:07:16] (step=0326100) Train Loss: 0.1474, Train Steps/Sec: 11.36, Grad Norm: 0.0059 +[2025-02-21 06:07:24] (step=0326200) Train Loss: 0.1477, Train Steps/Sec: 12.22, Grad Norm: 0.0061 +[2025-02-21 06:07:32] (step=0326300) Train Loss: 0.1481, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 06:07:40] (step=0326400) Train Loss: 0.1463, Train Steps/Sec: 12.64, Grad Norm: 0.0064 +[2025-02-21 06:07:47] (step=0326500) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 06:07:56] (step=0326600) Train Loss: 0.1461, Train Steps/Sec: 11.29, Grad Norm: 0.0062 +[2025-02-21 06:08:05] (step=0326700) Train Loss: 0.1479, Train Steps/Sec: 11.71, Grad Norm: 0.0058 +[2025-02-21 06:08:14] (step=0326800) Train Loss: 0.1469, Train Steps/Sec: 10.99, Grad Norm: 0.0063 +[2025-02-21 06:08:23] (step=0326900) Train Loss: 0.1465, Train Steps/Sec: 11.70, Grad Norm: 0.0062 +[2025-02-21 06:08:30] (step=0327000) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 06:08:38] (step=0327100) Train Loss: 0.1478, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 06:08:46] (step=0327200) Train Loss: 0.1467, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 06:08:54] (step=0327300) Train Loss: 0.1472, Train Steps/Sec: 12.65, Grad Norm: 0.0059 +[2025-02-21 06:09:02] (step=0327400) Train Loss: 0.1478, Train Steps/Sec: 12.19, Grad Norm: 0.0063 +[2025-02-21 06:09:11] (step=0327500) Train Loss: 0.1481, Train Steps/Sec: 11.81, Grad Norm: 0.0064 +[2025-02-21 06:09:20] (step=0327600) Train Loss: 0.1469, Train Steps/Sec: 10.69, Grad Norm: 0.0065 +[2025-02-21 06:09:28] (step=0327700) Train Loss: 0.1469, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 06:09:37] (step=0327800) Train Loss: 0.1482, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-21 06:09:45] (step=0327900) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 06:09:53] (step=0328000) Train Loss: 0.1461, Train Steps/Sec: 12.17, Grad Norm: 0.0059 +[2025-02-21 06:10:03] (step=0328100) Train Loss: 0.1477, Train Steps/Sec: 10.06, Grad Norm: 0.0061 +[2025-02-21 06:10:12] (step=0328200) Train Loss: 0.1462, Train Steps/Sec: 11.39, Grad Norm: 0.0057 +[2025-02-21 06:10:21] (step=0328300) Train Loss: 0.1465, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 06:10:29] (step=0328400) Train Loss: 0.1472, Train Steps/Sec: 12.17, Grad Norm: 0.0063 +[2025-02-21 06:10:37] (step=0328500) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 06:10:45] (step=0328600) Train Loss: 0.1472, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 06:10:53] (step=0328700) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 06:11:01] (step=0328800) Train Loss: 0.1467, Train Steps/Sec: 12.55, Grad Norm: 0.0059 +[2025-02-21 06:11:10] (step=0328900) Train Loss: 0.1470, Train Steps/Sec: 11.70, Grad Norm: 0.0061 +[2025-02-21 06:11:19] (step=0329000) Train Loss: 0.1466, Train Steps/Sec: 10.96, Grad Norm: 0.0059 +[2025-02-21 06:11:28] (step=0329100) Train Loss: 0.1468, Train Steps/Sec: 10.55, Grad Norm: 0.0061 +[2025-02-21 06:11:36] (step=0329200) Train Loss: 0.1462, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 06:11:44] (step=0329300) Train Loss: 0.1471, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 06:11:52] (step=0329400) Train Loss: 0.1450, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 06:12:00] (step=0329500) Train Loss: 0.1468, Train Steps/Sec: 12.11, Grad Norm: 0.0063 +[2025-02-21 06:12:09] (step=0329600) Train Loss: 0.1461, Train Steps/Sec: 11.83, Grad Norm: 0.0060 +[2025-02-21 06:12:18] (step=0329700) Train Loss: 0.1463, Train Steps/Sec: 11.06, Grad Norm: 0.0060 +[2025-02-21 06:12:26] (step=0329800) Train Loss: 0.1473, Train Steps/Sec: 11.75, Grad Norm: 0.0062 +[2025-02-21 06:12:34] (step=0329900) Train Loss: 0.1476, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 06:12:42] (step=0330000) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 06:12:50] (step=0330100) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 06:12:58] (step=0330200) Train Loss: 0.1473, Train Steps/Sec: 12.09, Grad Norm: 0.0060 +[2025-02-21 06:13:09] (step=0330300) Train Loss: 0.1463, Train Steps/Sec: 9.19, Grad Norm: 0.0063 +[2025-02-21 06:13:18] (step=0330400) Train Loss: 0.1466, Train Steps/Sec: 11.79, Grad Norm: 0.0059 +[2025-02-21 06:13:26] (step=0330500) Train Loss: 0.1469, Train Steps/Sec: 11.33, Grad Norm: 0.0058 +[2025-02-21 06:13:35] (step=0330600) Train Loss: 0.1460, Train Steps/Sec: 12.21, Grad Norm: 0.0061 +[2025-02-21 06:13:43] (step=0330700) Train Loss: 0.1470, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 06:13:51] (step=0330800) Train Loss: 0.1457, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 06:13:58] (step=0330900) Train Loss: 0.1481, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 06:14:06] (step=0331000) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 06:14:15] (step=0331100) Train Loss: 0.1465, Train Steps/Sec: 11.73, Grad Norm: 0.0059 +[2025-02-21 06:14:24] (step=0331200) Train Loss: 0.1466, Train Steps/Sec: 10.63, Grad Norm: 0.0059 +[2025-02-21 06:14:33] (step=0331300) Train Loss: 0.1475, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 06:14:40] (step=0331400) Train Loss: 0.1481, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 06:14:48] (step=0331500) Train Loss: 0.1469, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 06:14:57] (step=0331600) Train Loss: 0.1482, Train Steps/Sec: 11.28, Grad Norm: 0.0059 +[2025-02-21 06:15:06] (step=0331700) Train Loss: 0.1465, Train Steps/Sec: 11.76, Grad Norm: 0.0059 +[2025-02-21 06:15:15] (step=0331800) Train Loss: 0.1473, Train Steps/Sec: 11.41, Grad Norm: 0.0058 +[2025-02-21 06:15:23] (step=0331900) Train Loss: 0.1478, Train Steps/Sec: 11.39, Grad Norm: 0.0059 +[2025-02-21 06:15:32] (step=0332000) Train Loss: 0.1455, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 06:15:40] (step=0332100) Train Loss: 0.1470, Train Steps/Sec: 12.55, Grad Norm: 0.0060 +[2025-02-21 06:15:48] (step=0332200) Train Loss: 0.1473, Train Steps/Sec: 12.54, Grad Norm: 0.0059 +[2025-02-21 06:15:56] (step=0332300) Train Loss: 0.1464, Train Steps/Sec: 12.51, Grad Norm: 0.0061 +[2025-02-21 06:16:04] (step=0332400) Train Loss: 0.1466, Train Steps/Sec: 12.11, Grad Norm: 0.0062 +[2025-02-21 06:16:14] (step=0332500) Train Loss: 0.1474, Train Steps/Sec: 10.02, Grad Norm: 0.0065 +[2025-02-21 06:16:22] (step=0332600) Train Loss: 0.1470, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 06:16:31] (step=0332700) Train Loss: 0.1469, Train Steps/Sec: 11.26, Grad Norm: 0.0060 +[2025-02-21 06:16:40] (step=0332800) Train Loss: 0.1470, Train Steps/Sec: 10.78, Grad Norm: 0.0058 +[2025-02-21 06:16:48] (step=0332900) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 06:16:56] (step=0333000) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 06:17:04] (step=0333100) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 06:17:12] (step=0333200) Train Loss: 0.1467, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 06:17:21] (step=0333300) Train Loss: 0.1469, Train Steps/Sec: 11.64, Grad Norm: 0.0060 +[2025-02-21 06:17:30] (step=0333400) Train Loss: 0.1476, Train Steps/Sec: 10.60, Grad Norm: 0.0059 +[2025-02-21 06:17:38] (step=0333500) Train Loss: 0.1471, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 06:17:46] (step=0333600) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 06:17:54] (step=0333700) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 06:18:03] (step=0333800) Train Loss: 0.1467, Train Steps/Sec: 12.16, Grad Norm: 0.0060 +[2025-02-21 06:18:11] (step=0333900) Train Loss: 0.1464, Train Steps/Sec: 11.74, Grad Norm: 0.0060 +[2025-02-21 06:18:20] (step=0334000) Train Loss: 0.1484, Train Steps/Sec: 11.06, Grad Norm: 0.0065 +[2025-02-21 06:18:30] (step=0334100) Train Loss: 0.1475, Train Steps/Sec: 10.49, Grad Norm: 0.0058 +[2025-02-21 06:18:38] (step=0334200) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 06:18:46] (step=0334300) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 06:18:53] (step=0334400) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 06:19:01] (step=0334500) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 06:19:10] (step=0334600) Train Loss: 0.1471, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 06:19:20] (step=0334700) Train Loss: 0.1466, Train Steps/Sec: 9.77, Grad Norm: 0.0059 +[2025-02-21 06:19:28] (step=0334800) Train Loss: 0.1464, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 06:19:37] (step=0334900) Train Loss: 0.1471, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 06:19:45] (step=0335000) Train Loss: 0.1463, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 06:19:53] (step=0335100) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 06:20:01] (step=0335200) Train Loss: 0.1480, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 06:20:10] (step=0335300) Train Loss: 0.1463, Train Steps/Sec: 11.21, Grad Norm: 0.0061 +[2025-02-21 06:20:18] (step=0335400) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 06:20:26] (step=0335500) Train Loss: 0.1468, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 06:20:36] (step=0335600) Train Loss: 0.1465, Train Steps/Sec: 10.61, Grad Norm: 0.0070 +[2025-02-21 06:20:44] (step=0335700) Train Loss: 0.1461, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 06:20:52] (step=0335800) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 06:21:00] (step=0335900) Train Loss: 0.1480, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 06:21:08] (step=0336000) Train Loss: 0.1473, Train Steps/Sec: 11.38, Grad Norm: 0.0060 +[2025-02-21 06:21:17] (step=0336100) Train Loss: 0.1462, Train Steps/Sec: 11.79, Grad Norm: 0.0060 +[2025-02-21 06:21:26] (step=0336200) Train Loss: 0.1459, Train Steps/Sec: 11.41, Grad Norm: 0.0060 +[2025-02-21 06:21:34] (step=0336300) Train Loss: 0.1464, Train Steps/Sec: 12.16, Grad Norm: 0.0060 +[2025-02-21 06:21:42] (step=0336400) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 06:21:50] (step=0336500) Train Loss: 0.1468, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 06:21:59] (step=0336600) Train Loss: 0.1468, Train Steps/Sec: 11.22, Grad Norm: 0.0062 +[2025-02-21 06:22:07] (step=0336700) Train Loss: 0.1465, Train Steps/Sec: 12.10, Grad Norm: 0.0061 +[2025-02-21 06:22:16] (step=0336800) Train Loss: 0.1472, Train Steps/Sec: 11.74, Grad Norm: 0.0058 +[2025-02-21 06:22:25] (step=0336900) Train Loss: 0.1467, Train Steps/Sec: 10.36, Grad Norm: 0.0062 +[2025-02-21 06:22:33] (step=0337000) Train Loss: 0.1463, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 06:22:42] (step=0337100) Train Loss: 0.1477, Train Steps/Sec: 11.36, Grad Norm: 0.0060 +[2025-02-21 06:22:50] (step=0337200) Train Loss: 0.1473, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 06:22:58] (step=0337300) Train Loss: 0.1478, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 06:23:06] (step=0337400) Train Loss: 0.1466, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 06:23:14] (step=0337500) Train Loss: 0.1473, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 06:23:22] (step=0337600) Train Loss: 0.1475, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 06:23:30] (step=0337700) Train Loss: 0.1478, Train Steps/Sec: 11.72, Grad Norm: 0.0065 +[2025-02-21 06:23:41] (step=0337800) Train Loss: 0.1469, Train Steps/Sec: 9.66, Grad Norm: 0.0062 +[2025-02-21 06:23:49] (step=0337900) Train Loss: 0.1485, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 06:23:57] (step=0338000) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 06:24:05] (step=0338100) Train Loss: 0.1477, Train Steps/Sec: 11.70, Grad Norm: 0.0060 +[2025-02-21 06:24:14] (step=0338200) Train Loss: 0.1476, Train Steps/Sec: 11.79, Grad Norm: 0.0060 +[2025-02-21 06:24:23] (step=0338300) Train Loss: 0.1461, Train Steps/Sec: 11.43, Grad Norm: 0.0060 +[2025-02-21 06:24:31] (step=0338400) Train Loss: 0.1486, Train Steps/Sec: 12.16, Grad Norm: 0.0063 +[2025-02-21 06:24:39] (step=0338500) Train Loss: 0.1468, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 06:24:47] (step=0338600) Train Loss: 0.1482, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 06:24:55] (step=0338700) Train Loss: 0.1481, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 06:25:03] (step=0338800) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 06:25:11] (step=0338900) Train Loss: 0.1465, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 06:25:20] (step=0339000) Train Loss: 0.1464, Train Steps/Sec: 11.72, Grad Norm: 0.0059 +[2025-02-21 06:25:30] (step=0339100) Train Loss: 0.1467, Train Steps/Sec: 9.42, Grad Norm: 0.0062 +[2025-02-21 06:25:39] (step=0339200) Train Loss: 0.1470, Train Steps/Sec: 12.10, Grad Norm: 0.0061 +[2025-02-21 06:25:47] (step=0339300) Train Loss: 0.1466, Train Steps/Sec: 11.33, Grad Norm: 0.0064 +[2025-02-21 06:25:55] (step=0339400) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 06:26:03] (step=0339500) Train Loss: 0.1465, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 06:26:11] (step=0339600) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 06:26:19] (step=0339700) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 06:26:27] (step=0339800) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 06:26:36] (step=0339900) Train Loss: 0.1476, Train Steps/Sec: 11.36, Grad Norm: 0.0059 +[2025-02-21 06:26:45] (step=0340000) Train Loss: 0.1460, Train Steps/Sec: 10.94, Grad Norm: 0.0062 +[2025-02-21 06:26:50] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0340000.pt +[2025-02-21 06:26:58] (step=0340100) Train Loss: 0.1471, Train Steps/Sec: 7.78, Grad Norm: 0.0060 +[2025-02-21 06:27:06] (step=0340200) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 06:27:16] (step=0340300) Train Loss: 0.1464, Train Steps/Sec: 10.23, Grad Norm: 0.0060 +[2025-02-21 06:27:25] (step=0340400) Train Loss: 0.1466, Train Steps/Sec: 11.05, Grad Norm: 0.0059 +[2025-02-21 06:27:33] (step=0340500) Train Loss: 0.1461, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 06:27:41] (step=0340600) Train Loss: 0.1467, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 06:27:49] (step=0340700) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 06:27:57] (step=0340800) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 06:28:05] (step=0340900) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 06:28:13] (step=0341000) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 06:28:21] (step=0341100) Train Loss: 0.1478, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 06:28:30] (step=0341200) Train Loss: 0.1469, Train Steps/Sec: 11.05, Grad Norm: 0.0061 +[2025-02-21 06:28:40] (step=0341300) Train Loss: 0.1468, Train Steps/Sec: 10.32, Grad Norm: 0.0062 +[2025-02-21 06:28:48] (step=0341400) Train Loss: 0.1464, Train Steps/Sec: 11.70, Grad Norm: 0.0064 +[2025-02-21 06:28:57] (step=0341500) Train Loss: 0.1470, Train Steps/Sec: 11.67, Grad Norm: 0.0058 +[2025-02-21 06:29:06] (step=0341600) Train Loss: 0.1467, Train Steps/Sec: 11.28, Grad Norm: 0.0063 +[2025-02-21 06:29:14] (step=0341700) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 06:29:22] (step=0341800) Train Loss: 0.1478, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 06:29:30] (step=0341900) Train Loss: 0.1484, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 06:29:38] (step=0342000) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 06:29:46] (step=0342100) Train Loss: 0.1482, Train Steps/Sec: 11.36, Grad Norm: 0.0062 +[2025-02-21 06:29:56] (step=0342200) Train Loss: 0.1470, Train Steps/Sec: 10.60, Grad Norm: 0.0063 +[2025-02-21 06:30:04] (step=0342300) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 06:30:13] (step=0342400) Train Loss: 0.1462, Train Steps/Sec: 11.36, Grad Norm: 0.0060 +[2025-02-21 06:30:21] (step=0342500) Train Loss: 0.1452, Train Steps/Sec: 11.79, Grad Norm: 0.0059 +[2025-02-21 06:30:30] (step=0342600) Train Loss: 0.1461, Train Steps/Sec: 11.43, Grad Norm: 0.0059 +[2025-02-21 06:30:38] (step=0342700) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 06:30:47] (step=0342800) Train Loss: 0.1472, Train Steps/Sec: 10.90, Grad Norm: 0.0058 +[2025-02-21 06:30:55] (step=0342900) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 06:31:03] (step=0343000) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 06:31:11] (step=0343100) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0065 +[2025-02-21 06:31:19] (step=0343200) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 06:31:27] (step=0343300) Train Loss: 0.1461, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 06:31:36] (step=0343400) Train Loss: 0.1469, Train Steps/Sec: 10.76, Grad Norm: 0.0060 +[2025-02-21 06:31:46] (step=0343500) Train Loss: 0.1478, Train Steps/Sec: 10.67, Grad Norm: 0.0060 +[2025-02-21 06:31:54] (step=0343600) Train Loss: 0.1461, Train Steps/Sec: 11.73, Grad Norm: 0.0060 +[2025-02-21 06:32:03] (step=0343700) Train Loss: 0.1476, Train Steps/Sec: 11.74, Grad Norm: 0.0061 +[2025-02-21 06:32:11] (step=0343800) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 06:32:19] (step=0343900) Train Loss: 0.1471, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 06:32:27] (step=0344000) Train Loss: 0.1472, Train Steps/Sec: 12.48, Grad Norm: 0.0062 +[2025-02-21 06:32:35] (step=0344100) Train Loss: 0.1459, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-21 06:32:44] (step=0344200) Train Loss: 0.1472, Train Steps/Sec: 11.68, Grad Norm: 0.0059 +[2025-02-21 06:32:53] (step=0344300) Train Loss: 0.1469, Train Steps/Sec: 11.71, Grad Norm: 0.0064 +[2025-02-21 06:33:02] (step=0344400) Train Loss: 0.1468, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-21 06:33:10] (step=0344500) Train Loss: 0.1469, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 06:33:18] (step=0344600) Train Loss: 0.1479, Train Steps/Sec: 11.76, Grad Norm: 0.0063 +[2025-02-21 06:33:27] (step=0344700) Train Loss: 0.1475, Train Steps/Sec: 11.10, Grad Norm: 0.0061 +[2025-02-21 06:33:36] (step=0344800) Train Loss: 0.1473, Train Steps/Sec: 12.14, Grad Norm: 0.0063 +[2025-02-21 06:33:44] (step=0344900) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 06:33:52] (step=0345000) Train Loss: 0.1471, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 06:34:00] (step=0345100) Train Loss: 0.1468, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 06:34:08] (step=0345200) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 06:34:17] (step=0345300) Train Loss: 0.1466, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-21 06:34:24] (step=0345400) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 06:34:33] (step=0345500) Train Loss: 0.1466, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 06:34:43] (step=0345600) Train Loss: 0.1466, Train Steps/Sec: 10.07, Grad Norm: 0.0059 +[2025-02-21 06:34:51] (step=0345700) Train Loss: 0.1470, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 06:35:00] (step=0345800) Train Loss: 0.1463, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-21 06:35:09] (step=0345900) Train Loss: 0.1467, Train Steps/Sec: 12.10, Grad Norm: 0.0063 +[2025-02-21 06:35:17] (step=0346000) Train Loss: 0.1474, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 06:35:24] (step=0346100) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 06:35:32] (step=0346200) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 06:35:40] (step=0346300) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 06:35:48] (step=0346400) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 06:35:57] (step=0346500) Train Loss: 0.1473, Train Steps/Sec: 11.31, Grad Norm: 0.0059 +[2025-02-21 06:36:07] (step=0346600) Train Loss: 0.1469, Train Steps/Sec: 9.66, Grad Norm: 0.0061 +[2025-02-21 06:36:16] (step=0346700) Train Loss: 0.1481, Train Steps/Sec: 11.33, Grad Norm: 0.0059 +[2025-02-21 06:36:25] (step=0346800) Train Loss: 0.1478, Train Steps/Sec: 11.40, Grad Norm: 0.0060 +[2025-02-21 06:36:34] (step=0346900) Train Loss: 0.1459, Train Steps/Sec: 11.81, Grad Norm: 0.0063 +[2025-02-21 06:36:41] (step=0347000) Train Loss: 0.1478, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 06:36:50] (step=0347100) Train Loss: 0.1466, Train Steps/Sec: 12.16, Grad Norm: 0.0063 +[2025-02-21 06:36:58] (step=0347200) Train Loss: 0.1470, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 06:37:06] (step=0347300) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 06:37:13] (step=0347400) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 06:37:21] (step=0347500) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 06:37:29] (step=0347600) Train Loss: 0.1472, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 06:37:38] (step=0347700) Train Loss: 0.1472, Train Steps/Sec: 12.20, Grad Norm: 0.0063 +[2025-02-21 06:37:49] (step=0347800) Train Loss: 0.1461, Train Steps/Sec: 8.92, Grad Norm: 0.0062 +[2025-02-21 06:37:58] (step=0347900) Train Loss: 0.1475, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 06:38:06] (step=0348000) Train Loss: 0.1471, Train Steps/Sec: 11.70, Grad Norm: 0.0060 +[2025-02-21 06:38:14] (step=0348100) Train Loss: 0.1464, Train Steps/Sec: 12.18, Grad Norm: 0.0062 +[2025-02-21 06:38:22] (step=0348200) Train Loss: 0.1483, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 06:38:30] (step=0348300) Train Loss: 0.1470, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 06:38:38] (step=0348400) Train Loss: 0.1474, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 06:38:46] (step=0348500) Train Loss: 0.1480, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 06:38:54] (step=0348600) Train Loss: 0.1474, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 06:39:03] (step=0348700) Train Loss: 0.1461, Train Steps/Sec: 11.35, Grad Norm: 0.0059 +[2025-02-21 06:39:12] (step=0348800) Train Loss: 0.1466, Train Steps/Sec: 10.96, Grad Norm: 0.0060 +[2025-02-21 06:39:21] (step=0348900) Train Loss: 0.1463, Train Steps/Sec: 11.78, Grad Norm: 0.0060 +[2025-02-21 06:39:30] (step=0349000) Train Loss: 0.1465, Train Steps/Sec: 11.03, Grad Norm: 0.0060 +[2025-02-21 06:39:39] (step=0349100) Train Loss: 0.1469, Train Steps/Sec: 11.31, Grad Norm: 0.0062 +[2025-02-21 06:39:47] (step=0349200) Train Loss: 0.1472, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 06:39:55] (step=0349300) Train Loss: 0.1473, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 06:40:03] (step=0349400) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 06:40:11] (step=0349500) Train Loss: 0.1452, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 06:40:19] (step=0349600) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 06:40:27] (step=0349700) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 06:40:35] (step=0349800) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 06:40:43] (step=0349900) Train Loss: 0.1462, Train Steps/Sec: 11.78, Grad Norm: 0.0059 +[2025-02-21 06:40:53] (step=0350000) Train Loss: 0.1462, Train Steps/Sec: 9.75, Grad Norm: 0.0060 +[2025-02-21 06:41:02] (step=0350100) Train Loss: 0.1449, Train Steps/Sec: 11.76, Grad Norm: 0.0058 +[2025-02-21 06:41:10] (step=0350200) Train Loss: 0.1469, Train Steps/Sec: 11.73, Grad Norm: 0.0059 +[2025-02-21 06:41:20] (step=0350300) Train Loss: 0.1468, Train Steps/Sec: 10.85, Grad Norm: 0.0064 +[2025-02-21 06:41:28] (step=0350400) Train Loss: 0.1483, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 06:41:36] (step=0350500) Train Loss: 0.1473, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 06:41:43] (step=0350600) Train Loss: 0.1481, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 06:41:51] (step=0350700) Train Loss: 0.1476, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 06:42:00] (step=0350800) Train Loss: 0.1471, Train Steps/Sec: 11.70, Grad Norm: 0.0060 +[2025-02-21 06:42:09] (step=0350900) Train Loss: 0.1469, Train Steps/Sec: 10.92, Grad Norm: 0.0057 +[2025-02-21 06:42:18] (step=0351000) Train Loss: 0.1469, Train Steps/Sec: 10.97, Grad Norm: 0.0063 +[2025-02-21 06:42:27] (step=0351100) Train Loss: 0.1473, Train Steps/Sec: 11.04, Grad Norm: 0.0059 +[2025-02-21 06:42:36] (step=0351200) Train Loss: 0.1468, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 06:42:43] (step=0351300) Train Loss: 0.1480, Train Steps/Sec: 12.65, Grad Norm: 0.0061 +[2025-02-21 06:42:51] (step=0351400) Train Loss: 0.1478, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 06:43:00] (step=0351500) Train Loss: 0.1473, Train Steps/Sec: 12.09, Grad Norm: 0.0060 +[2025-02-21 06:43:08] (step=0351600) Train Loss: 0.1477, Train Steps/Sec: 11.26, Grad Norm: 0.0063 +[2025-02-21 06:43:16] (step=0351700) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 06:43:24] (step=0351800) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 06:43:32] (step=0351900) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 06:43:40] (step=0352000) Train Loss: 0.1458, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 06:43:49] (step=0352100) Train Loss: 0.1469, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 06:43:59] (step=0352200) Train Loss: 0.1460, Train Steps/Sec: 9.55, Grad Norm: 0.0060 +[2025-02-21 06:44:08] (step=0352300) Train Loss: 0.1468, Train Steps/Sec: 11.73, Grad Norm: 0.0059 +[2025-02-21 06:44:16] (step=0352400) Train Loss: 0.1470, Train Steps/Sec: 12.16, Grad Norm: 0.0062 +[2025-02-21 06:44:24] (step=0352500) Train Loss: 0.1472, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 06:44:32] (step=0352600) Train Loss: 0.1469, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 06:44:40] (step=0352700) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 06:44:49] (step=0352800) Train Loss: 0.1468, Train Steps/Sec: 11.23, Grad Norm: 0.0059 +[2025-02-21 06:44:57] (step=0352900) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 06:45:06] (step=0353000) Train Loss: 0.1459, Train Steps/Sec: 11.35, Grad Norm: 0.0061 +[2025-02-21 06:45:15] (step=0353100) Train Loss: 0.1464, Train Steps/Sec: 10.28, Grad Norm: 0.0064 +[2025-02-21 06:45:24] (step=0353200) Train Loss: 0.1466, Train Steps/Sec: 11.05, Grad Norm: 0.0064 +[2025-02-21 06:45:33] (step=0353300) Train Loss: 0.1463, Train Steps/Sec: 11.77, Grad Norm: 0.0059 +[2025-02-21 06:45:41] (step=0353400) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 06:45:49] (step=0353500) Train Loss: 0.1464, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 06:45:57] (step=0353600) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 06:46:05] (step=0353700) Train Loss: 0.1476, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 06:46:13] (step=0353800) Train Loss: 0.1479, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 06:46:21] (step=0353900) Train Loss: 0.1461, Train Steps/Sec: 12.65, Grad Norm: 0.0062 +[2025-02-21 06:46:29] (step=0354000) Train Loss: 0.1463, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 06:46:38] (step=0354100) Train Loss: 0.1462, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-21 06:46:46] (step=0354200) Train Loss: 0.1462, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 06:46:54] (step=0354300) Train Loss: 0.1474, Train Steps/Sec: 12.19, Grad Norm: 0.0060 +[2025-02-21 06:47:05] (step=0354400) Train Loss: 0.1466, Train Steps/Sec: 9.51, Grad Norm: 0.0062 +[2025-02-21 06:47:13] (step=0354500) Train Loss: 0.1475, Train Steps/Sec: 11.73, Grad Norm: 0.0060 +[2025-02-21 06:47:21] (step=0354600) Train Loss: 0.1477, Train Steps/Sec: 12.11, Grad Norm: 0.0059 +[2025-02-21 06:47:30] (step=0354700) Train Loss: 0.1476, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 06:47:37] (step=0354800) Train Loss: 0.1479, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 06:47:45] (step=0354900) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 06:47:53] (step=0355000) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 06:48:01] (step=0355100) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 06:48:11] (step=0355200) Train Loss: 0.1461, Train Steps/Sec: 10.63, Grad Norm: 0.0058 +[2025-02-21 06:48:21] (step=0355300) Train Loss: 0.1456, Train Steps/Sec: 9.40, Grad Norm: 0.0059 +[2025-02-21 06:48:30] (step=0355400) Train Loss: 0.1461, Train Steps/Sec: 11.01, Grad Norm: 0.0061 +[2025-02-21 06:48:38] (step=0355500) Train Loss: 0.1479, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 06:48:46] (step=0355600) Train Loss: 0.1481, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 06:48:54] (step=0355700) Train Loss: 0.1489, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 06:49:02] (step=0355800) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 06:49:10] (step=0355900) Train Loss: 0.1461, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 06:49:18] (step=0356000) Train Loss: 0.1476, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 06:49:26] (step=0356100) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 06:49:34] (step=0356200) Train Loss: 0.1452, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 06:49:42] (step=0356300) Train Loss: 0.1481, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 06:49:50] (step=0356400) Train Loss: 0.1471, Train Steps/Sec: 12.20, Grad Norm: 0.0059 +[2025-02-21 06:49:59] (step=0356500) Train Loss: 0.1472, Train Steps/Sec: 11.76, Grad Norm: 0.0059 +[2025-02-21 06:50:10] (step=0356600) Train Loss: 0.1475, Train Steps/Sec: 9.00, Grad Norm: 0.0061 +[2025-02-21 06:50:19] (step=0356700) Train Loss: 0.1462, Train Steps/Sec: 11.30, Grad Norm: 0.0061 +[2025-02-21 06:50:27] (step=0356800) Train Loss: 0.1466, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 06:50:35] (step=0356900) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 06:50:43] (step=0357000) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 06:50:51] (step=0357100) Train Loss: 0.1473, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 06:50:59] (step=0357200) Train Loss: 0.1484, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 06:51:07] (step=0357300) Train Loss: 0.1456, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 06:51:16] (step=0357400) Train Loss: 0.1462, Train Steps/Sec: 11.00, Grad Norm: 0.0060 +[2025-02-21 06:51:26] (step=0357500) Train Loss: 0.1459, Train Steps/Sec: 10.26, Grad Norm: 0.0059 +[2025-02-21 06:51:35] (step=0357600) Train Loss: 0.1477, Train Steps/Sec: 10.88, Grad Norm: 0.0059 +[2025-02-21 06:51:43] (step=0357700) Train Loss: 0.1460, Train Steps/Sec: 12.54, Grad Norm: 0.0058 +[2025-02-21 06:51:52] (step=0357800) Train Loss: 0.1455, Train Steps/Sec: 11.17, Grad Norm: 0.0058 +[2025-02-21 06:52:00] (step=0357900) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 06:52:08] (step=0358000) Train Loss: 0.1466, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 06:52:16] (step=0358100) Train Loss: 0.1462, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 06:52:24] (step=0358200) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 06:52:32] (step=0358300) Train Loss: 0.1479, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 06:52:40] (step=0358400) Train Loss: 0.1457, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 06:52:48] (step=0358500) Train Loss: 0.1468, Train Steps/Sec: 12.55, Grad Norm: 0.0063 +[2025-02-21 06:52:56] (step=0358600) Train Loss: 0.1470, Train Steps/Sec: 12.11, Grad Norm: 0.0059 +[2025-02-21 06:53:06] (step=0358700) Train Loss: 0.1464, Train Steps/Sec: 10.64, Grad Norm: 0.0059 +[2025-02-21 06:53:15] (step=0358800) Train Loss: 0.1477, Train Steps/Sec: 10.87, Grad Norm: 0.0061 +[2025-02-21 06:53:24] (step=0358900) Train Loss: 0.1469, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-21 06:53:32] (step=0359000) Train Loss: 0.1467, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 06:53:41] (step=0359100) Train Loss: 0.1478, Train Steps/Sec: 11.19, Grad Norm: 0.0061 +[2025-02-21 06:53:49] (step=0359200) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 06:53:57] (step=0359300) Train Loss: 0.1487, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 06:54:05] (step=0359400) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 06:54:13] (step=0359500) Train Loss: 0.1459, Train Steps/Sec: 11.38, Grad Norm: 0.0060 +[2025-02-21 06:54:23] (step=0359600) Train Loss: 0.1474, Train Steps/Sec: 10.70, Grad Norm: 0.0063 +[2025-02-21 06:54:33] (step=0359700) Train Loss: 0.1464, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-21 06:54:41] (step=0359800) Train Loss: 0.1469, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 06:54:49] (step=0359900) Train Loss: 0.1471, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 06:54:57] (step=0360000) Train Loss: 0.1476, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 06:55:01] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0360000.pt +[2025-02-21 06:55:09] (step=0360100) Train Loss: 0.1479, Train Steps/Sec: 8.46, Grad Norm: 0.0062 +[2025-02-21 06:55:17] (step=0360200) Train Loss: 0.1472, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 06:55:26] (step=0360300) Train Loss: 0.1474, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-21 06:55:34] (step=0360400) Train Loss: 0.1462, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 06:55:42] (step=0360500) Train Loss: 0.1468, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 06:55:50] (step=0360600) Train Loss: 0.1454, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 06:55:58] (step=0360700) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 06:56:06] (step=0360800) Train Loss: 0.1459, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 06:56:15] (step=0360900) Train Loss: 0.1450, Train Steps/Sec: 10.70, Grad Norm: 0.0060 +[2025-02-21 06:56:24] (step=0361000) Train Loss: 0.1455, Train Steps/Sec: 10.96, Grad Norm: 0.0060 +[2025-02-21 06:56:33] (step=0361100) Train Loss: 0.1462, Train Steps/Sec: 11.29, Grad Norm: 0.0059 +[2025-02-21 06:56:41] (step=0361200) Train Loss: 0.1470, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 06:56:49] (step=0361300) Train Loss: 0.1476, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 06:56:57] (step=0361400) Train Loss: 0.1470, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 06:57:05] (step=0361500) Train Loss: 0.1479, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 06:57:15] (step=0361600) Train Loss: 0.1468, Train Steps/Sec: 10.55, Grad Norm: 0.0060 +[2025-02-21 06:57:24] (step=0361700) Train Loss: 0.1457, Train Steps/Sec: 11.02, Grad Norm: 0.0058 +[2025-02-21 06:57:33] (step=0361800) Train Loss: 0.1453, Train Steps/Sec: 10.67, Grad Norm: 0.0057 +[2025-02-21 06:57:42] (step=0361900) Train Loss: 0.1469, Train Steps/Sec: 10.94, Grad Norm: 0.0059 +[2025-02-21 06:57:50] (step=0362000) Train Loss: 0.1471, Train Steps/Sec: 12.13, Grad Norm: 0.0063 +[2025-02-21 06:57:58] (step=0362100) Train Loss: 0.1480, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 06:58:06] (step=0362200) Train Loss: 0.1484, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 06:58:14] (step=0362300) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 06:58:23] (step=0362400) Train Loss: 0.1469, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 06:58:30] (step=0362500) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 06:58:38] (step=0362600) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 06:58:46] (step=0362700) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 06:58:55] (step=0362800) Train Loss: 0.1461, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-21 06:59:03] (step=0362900) Train Loss: 0.1465, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 06:59:12] (step=0363000) Train Loss: 0.1462, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 06:59:22] (step=0363100) Train Loss: 0.1469, Train Steps/Sec: 9.75, Grad Norm: 0.0061 +[2025-02-21 06:59:30] (step=0363200) Train Loss: 0.1476, Train Steps/Sec: 11.73, Grad Norm: 0.0062 +[2025-02-21 06:59:39] (step=0363300) Train Loss: 0.1472, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 06:59:47] (step=0363400) Train Loss: 0.1467, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 06:59:55] (step=0363500) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 07:00:03] (step=0363600) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 07:00:11] (step=0363700) Train Loss: 0.1474, Train Steps/Sec: 12.16, Grad Norm: 0.0062 +[2025-02-21 07:00:20] (step=0363800) Train Loss: 0.1475, Train Steps/Sec: 11.77, Grad Norm: 0.0060 +[2025-02-21 07:00:29] (step=0363900) Train Loss: 0.1475, Train Steps/Sec: 11.37, Grad Norm: 0.0062 +[2025-02-21 07:00:38] (step=0364000) Train Loss: 0.1470, Train Steps/Sec: 10.35, Grad Norm: 0.0059 +[2025-02-21 07:00:48] (step=0364100) Train Loss: 0.1461, Train Steps/Sec: 10.53, Grad Norm: 0.0063 +[2025-02-21 07:00:56] (step=0364200) Train Loss: 0.1465, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 07:01:04] (step=0364300) Train Loss: 0.1479, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 07:01:12] (step=0364400) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 07:01:20] (step=0364500) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 07:01:28] (step=0364600) Train Loss: 0.1452, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 07:01:36] (step=0364700) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 07:01:44] (step=0364800) Train Loss: 0.1479, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 07:01:52] (step=0364900) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 07:02:00] (step=0365000) Train Loss: 0.1472, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 07:02:08] (step=0365100) Train Loss: 0.1466, Train Steps/Sec: 12.20, Grad Norm: 0.0060 +[2025-02-21 07:02:16] (step=0365200) Train Loss: 0.1464, Train Steps/Sec: 12.22, Grad Norm: 0.0061 +[2025-02-21 07:02:27] (step=0365300) Train Loss: 0.1467, Train Steps/Sec: 8.97, Grad Norm: 0.0060 +[2025-02-21 07:02:36] (step=0365400) Train Loss: 0.1460, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-21 07:02:44] (step=0365500) Train Loss: 0.1472, Train Steps/Sec: 11.67, Grad Norm: 0.0064 +[2025-02-21 07:02:53] (step=0365600) Train Loss: 0.1472, Train Steps/Sec: 12.07, Grad Norm: 0.0061 +[2025-02-21 07:03:01] (step=0365700) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 07:03:08] (step=0365800) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 07:03:17] (step=0365900) Train Loss: 0.1468, Train Steps/Sec: 11.36, Grad Norm: 0.0063 +[2025-02-21 07:03:26] (step=0366000) Train Loss: 0.1464, Train Steps/Sec: 11.78, Grad Norm: 0.0060 +[2025-02-21 07:03:35] (step=0366100) Train Loss: 0.1471, Train Steps/Sec: 11.00, Grad Norm: 0.0059 +[2025-02-21 07:03:44] (step=0366200) Train Loss: 0.1474, Train Steps/Sec: 11.34, Grad Norm: 0.0062 +[2025-02-21 07:03:52] (step=0366300) Train Loss: 0.1478, Train Steps/Sec: 11.69, Grad Norm: 0.0059 +[2025-02-21 07:04:01] (step=0366400) Train Loss: 0.1470, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 07:04:08] (step=0366500) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 07:04:17] (step=0366600) Train Loss: 0.1475, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-21 07:04:26] (step=0366700) Train Loss: 0.1473, Train Steps/Sec: 12.13, Grad Norm: 0.0061 +[2025-02-21 07:04:34] (step=0366800) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 07:04:41] (step=0366900) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 07:04:49] (step=0367000) Train Loss: 0.1474, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 07:04:57] (step=0367100) Train Loss: 0.1484, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 07:05:05] (step=0367200) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 07:05:13] (step=0367300) Train Loss: 0.1481, Train Steps/Sec: 12.20, Grad Norm: 0.0059 +[2025-02-21 07:05:22] (step=0367400) Train Loss: 0.1464, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 07:05:32] (step=0367500) Train Loss: 0.1453, Train Steps/Sec: 9.46, Grad Norm: 0.0059 +[2025-02-21 07:05:40] (step=0367600) Train Loss: 0.1469, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 07:05:49] (step=0367700) Train Loss: 0.1461, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-21 07:05:58] (step=0367800) Train Loss: 0.1466, Train Steps/Sec: 10.93, Grad Norm: 0.0061 +[2025-02-21 07:06:06] (step=0367900) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 07:06:15] (step=0368000) Train Loss: 0.1475, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 07:06:23] (step=0368100) Train Loss: 0.1459, Train Steps/Sec: 11.82, Grad Norm: 0.0063 +[2025-02-21 07:06:32] (step=0368200) Train Loss: 0.1466, Train Steps/Sec: 11.00, Grad Norm: 0.0058 +[2025-02-21 07:06:40] (step=0368300) Train Loss: 0.1477, Train Steps/Sec: 12.16, Grad Norm: 0.0062 +[2025-02-21 07:06:49] (step=0368400) Train Loss: 0.1467, Train Steps/Sec: 11.36, Grad Norm: 0.0061 +[2025-02-21 07:06:58] (step=0368500) Train Loss: 0.1465, Train Steps/Sec: 11.73, Grad Norm: 0.0059 +[2025-02-21 07:07:06] (step=0368600) Train Loss: 0.1462, Train Steps/Sec: 12.16, Grad Norm: 0.0063 +[2025-02-21 07:07:14] (step=0368700) Train Loss: 0.1463, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 07:07:22] (step=0368800) Train Loss: 0.1484, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 07:07:30] (step=0368900) Train Loss: 0.1462, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 07:07:38] (step=0369000) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 07:07:47] (step=0369100) Train Loss: 0.1468, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-21 07:07:55] (step=0369200) Train Loss: 0.1455, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 07:08:03] (step=0369300) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 07:08:11] (step=0369400) Train Loss: 0.1467, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 07:08:19] (step=0369500) Train Loss: 0.1453, Train Steps/Sec: 12.15, Grad Norm: 0.0057 +[2025-02-21 07:08:28] (step=0369600) Train Loss: 0.1468, Train Steps/Sec: 11.42, Grad Norm: 0.0061 +[2025-02-21 07:08:38] (step=0369700) Train Loss: 0.1474, Train Steps/Sec: 10.03, Grad Norm: 0.0060 +[2025-02-21 07:08:46] (step=0369800) Train Loss: 0.1471, Train Steps/Sec: 11.72, Grad Norm: 0.0062 +[2025-02-21 07:08:55] (step=0369900) Train Loss: 0.1471, Train Steps/Sec: 11.68, Grad Norm: 0.0060 +[2025-02-21 07:09:03] (step=0370000) Train Loss: 0.1471, Train Steps/Sec: 12.55, Grad Norm: 0.0058 +[2025-02-21 07:09:11] (step=0370100) Train Loss: 0.1466, Train Steps/Sec: 12.16, Grad Norm: 0.0058 +[2025-02-21 07:09:20] (step=0370200) Train Loss: 0.1468, Train Steps/Sec: 11.72, Grad Norm: 0.0058 +[2025-02-21 07:09:29] (step=0370300) Train Loss: 0.1464, Train Steps/Sec: 10.25, Grad Norm: 0.0058 +[2025-02-21 07:09:38] (step=0370400) Train Loss: 0.1467, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 07:09:46] (step=0370500) Train Loss: 0.1455, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 07:09:55] (step=0370600) Train Loss: 0.1479, Train Steps/Sec: 11.34, Grad Norm: 0.0062 +[2025-02-21 07:10:04] (step=0370700) Train Loss: 0.1472, Train Steps/Sec: 11.30, Grad Norm: 0.0060 +[2025-02-21 07:10:12] (step=0370800) Train Loss: 0.1472, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 07:10:20] (step=0370900) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 07:10:28] (step=0371000) Train Loss: 0.1482, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 07:10:36] (step=0371100) Train Loss: 0.1464, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 07:10:44] (step=0371200) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 07:10:52] (step=0371300) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 07:11:00] (step=0371400) Train Loss: 0.1470, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 07:11:08] (step=0371500) Train Loss: 0.1460, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 07:11:17] (step=0371600) Train Loss: 0.1466, Train Steps/Sec: 11.26, Grad Norm: 0.0062 +[2025-02-21 07:11:25] (step=0371700) Train Loss: 0.1477, Train Steps/Sec: 12.17, Grad Norm: 0.0059 +[2025-02-21 07:11:34] (step=0371800) Train Loss: 0.1476, Train Steps/Sec: 10.64, Grad Norm: 0.0061 +[2025-02-21 07:11:44] (step=0371900) Train Loss: 0.1463, Train Steps/Sec: 10.66, Grad Norm: 0.0060 +[2025-02-21 07:11:53] (step=0372000) Train Loss: 0.1472, Train Steps/Sec: 11.31, Grad Norm: 0.0059 +[2025-02-21 07:12:01] (step=0372100) Train Loss: 0.1460, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 07:12:09] (step=0372200) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 07:12:17] (step=0372300) Train Loss: 0.1466, Train Steps/Sec: 11.38, Grad Norm: 0.0059 +[2025-02-21 07:12:26] (step=0372400) Train Loss: 0.1461, Train Steps/Sec: 12.20, Grad Norm: 0.0058 +[2025-02-21 07:12:35] (step=0372500) Train Loss: 0.1465, Train Steps/Sec: 10.96, Grad Norm: 0.0059 +[2025-02-21 07:12:43] (step=0372600) Train Loss: 0.1477, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 07:12:51] (step=0372700) Train Loss: 0.1473, Train Steps/Sec: 12.17, Grad Norm: 0.0062 +[2025-02-21 07:13:01] (step=0372800) Train Loss: 0.1454, Train Steps/Sec: 10.24, Grad Norm: 0.0060 +[2025-02-21 07:13:10] (step=0372900) Train Loss: 0.1462, Train Steps/Sec: 11.33, Grad Norm: 0.0059 +[2025-02-21 07:13:18] (step=0373000) Train Loss: 0.1467, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 07:13:26] (step=0373100) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 07:13:34] (step=0373200) Train Loss: 0.1472, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 07:13:42] (step=0373300) Train Loss: 0.1471, Train Steps/Sec: 12.13, Grad Norm: 0.0058 +[2025-02-21 07:13:50] (step=0373400) Train Loss: 0.1456, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 07:13:58] (step=0373500) Train Loss: 0.1477, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 07:14:06] (step=0373600) Train Loss: 0.1467, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 07:14:14] (step=0373700) Train Loss: 0.1462, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 07:14:22] (step=0373800) Train Loss: 0.1481, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 07:14:30] (step=0373900) Train Loss: 0.1472, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 07:14:40] (step=0374000) Train Loss: 0.1464, Train Steps/Sec: 10.33, Grad Norm: 0.0063 +[2025-02-21 07:14:50] (step=0374100) Train Loss: 0.1468, Train Steps/Sec: 9.66, Grad Norm: 0.0061 +[2025-02-21 07:14:59] (step=0374200) Train Loss: 0.1472, Train Steps/Sec: 11.70, Grad Norm: 0.0061 +[2025-02-21 07:15:07] (step=0374300) Train Loss: 0.1473, Train Steps/Sec: 12.15, Grad Norm: 0.0066 +[2025-02-21 07:15:15] (step=0374400) Train Loss: 0.1473, Train Steps/Sec: 11.77, Grad Norm: 0.0058 +[2025-02-21 07:15:24] (step=0374500) Train Loss: 0.1477, Train Steps/Sec: 11.77, Grad Norm: 0.0059 +[2025-02-21 07:15:33] (step=0374600) Train Loss: 0.1468, Train Steps/Sec: 11.36, Grad Norm: 0.0063 +[2025-02-21 07:15:41] (step=0374700) Train Loss: 0.1452, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 07:15:49] (step=0374800) Train Loss: 0.1475, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 07:15:57] (step=0374900) Train Loss: 0.1455, Train Steps/Sec: 11.76, Grad Norm: 0.0060 +[2025-02-21 07:16:06] (step=0375000) Train Loss: 0.1462, Train Steps/Sec: 11.71, Grad Norm: 0.0061 +[2025-02-21 07:16:15] (step=0375100) Train Loss: 0.1453, Train Steps/Sec: 11.36, Grad Norm: 0.0059 +[2025-02-21 07:16:23] (step=0375200) Train Loss: 0.1467, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 07:16:31] (step=0375300) Train Loss: 0.1462, Train Steps/Sec: 12.00, Grad Norm: 0.0060 +[2025-02-21 07:16:40] (step=0375400) Train Loss: 0.1493, Train Steps/Sec: 11.79, Grad Norm: 0.0061 +[2025-02-21 07:16:48] (step=0375500) Train Loss: 0.1465, Train Steps/Sec: 11.99, Grad Norm: 0.0060 +[2025-02-21 07:16:56] (step=0375600) Train Loss: 0.1471, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 07:17:04] (step=0375700) Train Loss: 0.1468, Train Steps/Sec: 12.49, Grad Norm: 0.0064 +[2025-02-21 07:17:12] (step=0375800) Train Loss: 0.1482, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 07:17:20] (step=0375900) Train Loss: 0.1476, Train Steps/Sec: 12.57, Grad Norm: 0.0057 +[2025-02-21 07:17:28] (step=0376000) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 07:17:36] (step=0376100) Train Loss: 0.1463, Train Steps/Sec: 12.20, Grad Norm: 0.0063 +[2025-02-21 07:17:46] (step=0376200) Train Loss: 0.1457, Train Steps/Sec: 9.78, Grad Norm: 0.0064 +[2025-02-21 07:17:55] (step=0376300) Train Loss: 0.1471, Train Steps/Sec: 11.38, Grad Norm: 0.0061 +[2025-02-21 07:18:04] (step=0376400) Train Loss: 0.1477, Train Steps/Sec: 11.68, Grad Norm: 0.0060 +[2025-02-21 07:18:12] (step=0376500) Train Loss: 0.1474, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-21 07:18:22] (step=0376600) Train Loss: 0.1480, Train Steps/Sec: 10.41, Grad Norm: 0.0058 +[2025-02-21 07:18:30] (step=0376700) Train Loss: 0.1468, Train Steps/Sec: 11.74, Grad Norm: 0.0058 +[2025-02-21 07:18:39] (step=0376800) Train Loss: 0.1451, Train Steps/Sec: 11.38, Grad Norm: 0.0060 +[2025-02-21 07:18:47] (step=0376900) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 07:18:55] (step=0377000) Train Loss: 0.1464, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 07:19:04] (step=0377100) Train Loss: 0.1473, Train Steps/Sec: 11.42, Grad Norm: 0.0064 +[2025-02-21 07:19:13] (step=0377200) Train Loss: 0.1469, Train Steps/Sec: 11.01, Grad Norm: 0.0061 +[2025-02-21 07:19:21] (step=0377300) Train Loss: 0.1477, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 07:19:29] (step=0377400) Train Loss: 0.1468, Train Steps/Sec: 12.65, Grad Norm: 0.0058 +[2025-02-21 07:19:37] (step=0377500) Train Loss: 0.1473, Train Steps/Sec: 12.65, Grad Norm: 0.0061 +[2025-02-21 07:19:45] (step=0377600) Train Loss: 0.1474, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 07:19:53] (step=0377700) Train Loss: 0.1482, Train Steps/Sec: 12.13, Grad Norm: 0.0063 +[2025-02-21 07:20:01] (step=0377800) Train Loss: 0.1457, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 07:20:10] (step=0377900) Train Loss: 0.1457, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-21 07:20:18] (step=0378000) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 07:20:26] (step=0378100) Train Loss: 0.1460, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 07:20:34] (step=0378200) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 07:20:42] (step=0378300) Train Loss: 0.1477, Train Steps/Sec: 12.19, Grad Norm: 0.0062 +[2025-02-21 07:20:52] (step=0378400) Train Loss: 0.1479, Train Steps/Sec: 9.74, Grad Norm: 0.0060 +[2025-02-21 07:21:01] (step=0378500) Train Loss: 0.1461, Train Steps/Sec: 11.40, Grad Norm: 0.0058 +[2025-02-21 07:21:09] (step=0378600) Train Loss: 0.1459, Train Steps/Sec: 11.72, Grad Norm: 0.0059 +[2025-02-21 07:21:18] (step=0378700) Train Loss: 0.1458, Train Steps/Sec: 11.04, Grad Norm: 0.0060 +[2025-02-21 07:21:27] (step=0378800) Train Loss: 0.1471, Train Steps/Sec: 12.16, Grad Norm: 0.0062 +[2025-02-21 07:21:36] (step=0378900) Train Loss: 0.1458, Train Steps/Sec: 11.02, Grad Norm: 0.0060 +[2025-02-21 07:21:44] (step=0379000) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 07:21:52] (step=0379100) Train Loss: 0.1466, Train Steps/Sec: 11.25, Grad Norm: 0.0059 +[2025-02-21 07:22:00] (step=0379200) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 07:22:09] (step=0379300) Train Loss: 0.1467, Train Steps/Sec: 11.34, Grad Norm: 0.0059 +[2025-02-21 07:22:18] (step=0379400) Train Loss: 0.1456, Train Steps/Sec: 10.93, Grad Norm: 0.0058 +[2025-02-21 07:22:27] (step=0379500) Train Loss: 0.1454, Train Steps/Sec: 12.18, Grad Norm: 0.0058 +[2025-02-21 07:22:35] (step=0379600) Train Loss: 0.1467, Train Steps/Sec: 12.65, Grad Norm: 0.0061 +[2025-02-21 07:22:42] (step=0379700) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 07:22:50] (step=0379800) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 07:22:59] (step=0379900) Train Loss: 0.1477, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 07:23:07] (step=0380000) Train Loss: 0.1479, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 07:23:10] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0380000.pt +[2025-02-21 07:23:18] (step=0380100) Train Loss: 0.1464, Train Steps/Sec: 9.06, Grad Norm: 0.0063 +[2025-02-21 07:23:25] (step=0380200) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 07:23:33] (step=0380300) Train Loss: 0.1468, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 07:23:43] (step=0380400) Train Loss: 0.1481, Train Steps/Sec: 10.95, Grad Norm: 0.0061 +[2025-02-21 07:23:51] (step=0380500) Train Loss: 0.1485, Train Steps/Sec: 12.20, Grad Norm: 0.0060 +[2025-02-21 07:24:01] (step=0380600) Train Loss: 0.1460, Train Steps/Sec: 10.01, Grad Norm: 0.0061 +[2025-02-21 07:24:10] (step=0380700) Train Loss: 0.1458, Train Steps/Sec: 11.37, Grad Norm: 0.0057 +[2025-02-21 07:24:19] (step=0380800) Train Loss: 0.1478, Train Steps/Sec: 10.99, Grad Norm: 0.0061 +[2025-02-21 07:24:27] (step=0380900) Train Loss: 0.1464, Train Steps/Sec: 11.37, Grad Norm: 0.0060 +[2025-02-21 07:24:36] (step=0381000) Train Loss: 0.1467, Train Steps/Sec: 11.39, Grad Norm: 0.0062 +[2025-02-21 07:24:44] (step=0381100) Train Loss: 0.1463, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 07:24:52] (step=0381200) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 07:25:00] (step=0381300) Train Loss: 0.1465, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 07:25:08] (step=0381400) Train Loss: 0.1483, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 07:25:17] (step=0381500) Train Loss: 0.1455, Train Steps/Sec: 11.40, Grad Norm: 0.0060 +[2025-02-21 07:25:27] (step=0381600) Train Loss: 0.1465, Train Steps/Sec: 9.91, Grad Norm: 0.0059 +[2025-02-21 07:25:35] (step=0381700) Train Loss: 0.1460, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 07:25:43] (step=0381800) Train Loss: 0.1479, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 07:25:51] (step=0381900) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 07:25:59] (step=0382000) Train Loss: 0.1465, Train Steps/Sec: 12.13, Grad Norm: 0.0061 +[2025-02-21 07:26:07] (step=0382100) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 07:26:15] (step=0382200) Train Loss: 0.1472, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 07:26:23] (step=0382300) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 07:26:31] (step=0382400) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 07:26:39] (step=0382500) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 07:26:47] (step=0382600) Train Loss: 0.1459, Train Steps/Sec: 12.20, Grad Norm: 0.0060 +[2025-02-21 07:26:56] (step=0382700) Train Loss: 0.1470, Train Steps/Sec: 11.79, Grad Norm: 0.0060 +[2025-02-21 07:27:06] (step=0382800) Train Loss: 0.1456, Train Steps/Sec: 10.25, Grad Norm: 0.0061 +[2025-02-21 07:27:15] (step=0382900) Train Loss: 0.1461, Train Steps/Sec: 10.27, Grad Norm: 0.0059 +[2025-02-21 07:27:25] (step=0383000) Train Loss: 0.1473, Train Steps/Sec: 10.59, Grad Norm: 0.0059 +[2025-02-21 07:27:33] (step=0383100) Train Loss: 0.1459, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 07:27:42] (step=0383200) Train Loss: 0.1456, Train Steps/Sec: 10.99, Grad Norm: 0.0058 +[2025-02-21 07:27:50] (step=0383300) Train Loss: 0.1478, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 07:27:58] (step=0383400) Train Loss: 0.1456, Train Steps/Sec: 12.65, Grad Norm: 0.0061 +[2025-02-21 07:28:06] (step=0383500) Train Loss: 0.1468, Train Steps/Sec: 12.64, Grad Norm: 0.0066 +[2025-02-21 07:28:14] (step=0383600) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 07:28:23] (step=0383700) Train Loss: 0.1478, Train Steps/Sec: 11.38, Grad Norm: 0.0060 +[2025-02-21 07:28:32] (step=0383800) Train Loss: 0.1463, Train Steps/Sec: 10.91, Grad Norm: 0.0060 +[2025-02-21 07:28:40] (step=0383900) Train Loss: 0.1465, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 07:28:48] (step=0384000) Train Loss: 0.1474, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 07:28:57] (step=0384100) Train Loss: 0.1462, Train Steps/Sec: 11.28, Grad Norm: 0.0060 +[2025-02-21 07:29:05] (step=0384200) Train Loss: 0.1458, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 07:29:13] (step=0384300) Train Loss: 0.1463, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 07:29:21] (step=0384400) Train Loss: 0.1455, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 07:29:29] (step=0384500) Train Loss: 0.1462, Train Steps/Sec: 12.65, Grad Norm: 0.0060 +[2025-02-21 07:29:37] (step=0384600) Train Loss: 0.1472, Train Steps/Sec: 12.65, Grad Norm: 0.0061 +[2025-02-21 07:29:44] (step=0384700) Train Loss: 0.1462, Train Steps/Sec: 12.65, Grad Norm: 0.0060 +[2025-02-21 07:29:53] (step=0384800) Train Loss: 0.1462, Train Steps/Sec: 12.24, Grad Norm: 0.0060 +[2025-02-21 07:30:01] (step=0384900) Train Loss: 0.1455, Train Steps/Sec: 11.43, Grad Norm: 0.0060 +[2025-02-21 07:30:11] (step=0385000) Train Loss: 0.1455, Train Steps/Sec: 10.64, Grad Norm: 0.0060 +[2025-02-21 07:30:20] (step=0385100) Train Loss: 0.1457, Train Steps/Sec: 10.65, Grad Norm: 0.0061 +[2025-02-21 07:30:29] (step=0385200) Train Loss: 0.1465, Train Steps/Sec: 11.00, Grad Norm: 0.0060 +[2025-02-21 07:30:38] (step=0385300) Train Loss: 0.1475, Train Steps/Sec: 11.36, Grad Norm: 0.0063 +[2025-02-21 07:30:47] (step=0385400) Train Loss: 0.1466, Train Steps/Sec: 10.92, Grad Norm: 0.0061 +[2025-02-21 07:30:55] (step=0385500) Train Loss: 0.1457, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 07:31:03] (step=0385600) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 07:31:11] (step=0385700) Train Loss: 0.1467, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 07:31:19] (step=0385800) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 07:31:28] (step=0385900) Train Loss: 0.1459, Train Steps/Sec: 11.41, Grad Norm: 0.0058 +[2025-02-21 07:31:37] (step=0386000) Train Loss: 0.1465, Train Steps/Sec: 10.99, Grad Norm: 0.0060 +[2025-02-21 07:31:45] (step=0386100) Train Loss: 0.1475, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 07:31:53] (step=0386200) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 07:32:01] (step=0386300) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 07:32:09] (step=0386400) Train Loss: 0.1463, Train Steps/Sec: 12.13, Grad Norm: 0.0061 +[2025-02-21 07:32:17] (step=0386500) Train Loss: 0.1468, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 07:32:26] (step=0386600) Train Loss: 0.1458, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-21 07:32:34] (step=0386700) Train Loss: 0.1464, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 07:32:42] (step=0386800) Train Loss: 0.1487, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 07:32:50] (step=0386900) Train Loss: 0.1461, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 07:32:58] (step=0387000) Train Loss: 0.1471, Train Steps/Sec: 12.22, Grad Norm: 0.0058 +[2025-02-21 07:33:07] (step=0387100) Train Loss: 0.1480, Train Steps/Sec: 11.01, Grad Norm: 0.0061 +[2025-02-21 07:33:18] (step=0387200) Train Loss: 0.1483, Train Steps/Sec: 9.50, Grad Norm: 0.0062 +[2025-02-21 07:33:26] (step=0387300) Train Loss: 0.1465, Train Steps/Sec: 11.38, Grad Norm: 0.0061 +[2025-02-21 07:33:36] (step=0387400) Train Loss: 0.1463, Train Steps/Sec: 10.99, Grad Norm: 0.0060 +[2025-02-21 07:33:44] (step=0387500) Train Loss: 0.1464, Train Steps/Sec: 11.80, Grad Norm: 0.0059 +[2025-02-21 07:33:52] (step=0387600) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 07:34:00] (step=0387700) Train Loss: 0.1479, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 07:34:08] (step=0387800) Train Loss: 0.1465, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 07:34:17] (step=0387900) Train Loss: 0.1470, Train Steps/Sec: 11.31, Grad Norm: 0.0058 +[2025-02-21 07:34:25] (step=0388000) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 07:34:33] (step=0388100) Train Loss: 0.1468, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 07:34:42] (step=0388200) Train Loss: 0.1455, Train Steps/Sec: 11.35, Grad Norm: 0.0061 +[2025-02-21 07:34:50] (step=0388300) Train Loss: 0.1455, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 07:34:58] (step=0388400) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 07:35:06] (step=0388500) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 07:35:15] (step=0388600) Train Loss: 0.1474, Train Steps/Sec: 12.13, Grad Norm: 0.0065 +[2025-02-21 07:35:22] (step=0388700) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 07:35:30] (step=0388800) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 07:35:38] (step=0388900) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 07:35:46] (step=0389000) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 07:35:55] (step=0389100) Train Loss: 0.1464, Train Steps/Sec: 11.28, Grad Norm: 0.0057 +[2025-02-21 07:36:03] (step=0389200) Train Loss: 0.1460, Train Steps/Sec: 12.17, Grad Norm: 0.0059 +[2025-02-21 07:36:13] (step=0389300) Train Loss: 0.1472, Train Steps/Sec: 10.33, Grad Norm: 0.0062 +[2025-02-21 07:36:23] (step=0389400) Train Loss: 0.1463, Train Steps/Sec: 9.77, Grad Norm: 0.0060 +[2025-02-21 07:36:32] (step=0389500) Train Loss: 0.1469, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 07:36:41] (step=0389600) Train Loss: 0.1451, Train Steps/Sec: 10.67, Grad Norm: 0.0058 +[2025-02-21 07:36:49] (step=0389700) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0065 +[2025-02-21 07:36:57] (step=0389800) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 07:37:05] (step=0389900) Train Loss: 0.1473, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 07:37:13] (step=0390000) Train Loss: 0.1470, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 07:37:21] (step=0390100) Train Loss: 0.1484, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 07:37:29] (step=0390200) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 07:37:38] (step=0390300) Train Loss: 0.1464, Train Steps/Sec: 11.32, Grad Norm: 0.0066 +[2025-02-21 07:37:48] (step=0390400) Train Loss: 0.1469, Train Steps/Sec: 10.28, Grad Norm: 0.0061 +[2025-02-21 07:37:56] (step=0390500) Train Loss: 0.1461, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 07:38:04] (step=0390600) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 07:38:12] (step=0390700) Train Loss: 0.1462, Train Steps/Sec: 12.17, Grad Norm: 0.0058 +[2025-02-21 07:38:20] (step=0390800) Train Loss: 0.1468, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 07:38:28] (step=0390900) Train Loss: 0.1471, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 07:38:36] (step=0391000) Train Loss: 0.1449, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 07:38:44] (step=0391100) Train Loss: 0.1481, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 07:38:52] (step=0391200) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 07:38:59] (step=0391300) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 07:39:08] (step=0391400) Train Loss: 0.1470, Train Steps/Sec: 11.77, Grad Norm: 0.0058 +[2025-02-21 07:39:18] (step=0391500) Train Loss: 0.1468, Train Steps/Sec: 9.74, Grad Norm: 0.0059 +[2025-02-21 07:39:29] (step=0391600) Train Loss: 0.1466, Train Steps/Sec: 9.45, Grad Norm: 0.0060 +[2025-02-21 07:39:38] (step=0391700) Train Loss: 0.1459, Train Steps/Sec: 10.96, Grad Norm: 0.0057 +[2025-02-21 07:39:46] (step=0391800) Train Loss: 0.1468, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 07:39:54] (step=0391900) Train Loss: 0.1472, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 07:40:02] (step=0392000) Train Loss: 0.1456, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 07:40:10] (step=0392100) Train Loss: 0.1464, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 07:40:18] (step=0392200) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 07:40:26] (step=0392300) Train Loss: 0.1464, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 07:40:34] (step=0392400) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 07:40:43] (step=0392500) Train Loss: 0.1468, Train Steps/Sec: 11.37, Grad Norm: 0.0060 +[2025-02-21 07:40:52] (step=0392600) Train Loss: 0.1470, Train Steps/Sec: 11.35, Grad Norm: 0.0062 +[2025-02-21 07:41:00] (step=0392700) Train Loss: 0.1471, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 07:41:08] (step=0392800) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 07:41:17] (step=0392900) Train Loss: 0.1481, Train Steps/Sec: 10.90, Grad Norm: 0.0058 +[2025-02-21 07:41:25] (step=0393000) Train Loss: 0.1472, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 07:41:33] (step=0393100) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 07:41:41] (step=0393200) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 07:41:49] (step=0393300) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 07:41:57] (step=0393400) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 07:42:05] (step=0393500) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 07:42:13] (step=0393600) Train Loss: 0.1458, Train Steps/Sec: 11.39, Grad Norm: 0.0060 +[2025-02-21 07:42:23] (step=0393700) Train Loss: 0.1464, Train Steps/Sec: 10.35, Grad Norm: 0.0059 +[2025-02-21 07:42:33] (step=0393800) Train Loss: 0.1480, Train Steps/Sec: 10.00, Grad Norm: 0.0064 +[2025-02-21 07:42:42] (step=0393900) Train Loss: 0.1470, Train Steps/Sec: 10.98, Grad Norm: 0.0060 +[2025-02-21 07:42:50] (step=0394000) Train Loss: 0.1473, Train Steps/Sec: 12.17, Grad Norm: 0.0062 +[2025-02-21 07:42:59] (step=0394100) Train Loss: 0.1475, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-21 07:43:07] (step=0394200) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 07:43:15] (step=0394300) Train Loss: 0.1464, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 07:43:23] (step=0394400) Train Loss: 0.1483, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 07:43:31] (step=0394500) Train Loss: 0.1455, Train Steps/Sec: 12.65, Grad Norm: 0.0065 +[2025-02-21 07:43:39] (step=0394600) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 07:43:48] (step=0394700) Train Loss: 0.1469, Train Steps/Sec: 11.36, Grad Norm: 0.0059 +[2025-02-21 07:43:56] (step=0394800) Train Loss: 0.1461, Train Steps/Sec: 11.38, Grad Norm: 0.0062 +[2025-02-21 07:44:05] (step=0394900) Train Loss: 0.1469, Train Steps/Sec: 12.15, Grad Norm: 0.0058 +[2025-02-21 07:44:13] (step=0395000) Train Loss: 0.1461, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 07:44:21] (step=0395100) Train Loss: 0.1465, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 07:44:29] (step=0395200) Train Loss: 0.1491, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 07:44:37] (step=0395300) Train Loss: 0.1460, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 07:44:46] (step=0395400) Train Loss: 0.1475, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-21 07:44:54] (step=0395500) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 07:45:01] (step=0395600) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 07:45:10] (step=0395700) Train Loss: 0.1463, Train Steps/Sec: 12.20, Grad Norm: 0.0060 +[2025-02-21 07:45:19] (step=0395800) Train Loss: 0.1463, Train Steps/Sec: 10.71, Grad Norm: 0.0060 +[2025-02-21 07:45:29] (step=0395900) Train Loss: 0.1468, Train Steps/Sec: 10.03, Grad Norm: 0.0060 +[2025-02-21 07:45:38] (step=0396000) Train Loss: 0.1472, Train Steps/Sec: 11.04, Grad Norm: 0.0062 +[2025-02-21 07:45:47] (step=0396100) Train Loss: 0.1461, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 07:45:55] (step=0396200) Train Loss: 0.1488, Train Steps/Sec: 12.16, Grad Norm: 0.0063 +[2025-02-21 07:46:03] (step=0396300) Train Loss: 0.1469, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 07:46:11] (step=0396400) Train Loss: 0.1452, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 07:46:19] (step=0396500) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 07:46:28] (step=0396600) Train Loss: 0.1465, Train Steps/Sec: 11.30, Grad Norm: 0.0062 +[2025-02-21 07:46:36] (step=0396700) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 07:46:44] (step=0396800) Train Loss: 0.1478, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 07:46:52] (step=0396900) Train Loss: 0.1474, Train Steps/Sec: 11.73, Grad Norm: 0.0060 +[2025-02-21 07:47:01] (step=0397000) Train Loss: 0.1468, Train Steps/Sec: 11.00, Grad Norm: 0.0059 +[2025-02-21 07:47:09] (step=0397100) Train Loss: 0.1477, Train Steps/Sec: 12.65, Grad Norm: 0.0061 +[2025-02-21 07:47:17] (step=0397200) Train Loss: 0.1469, Train Steps/Sec: 12.65, Grad Norm: 0.0061 +[2025-02-21 07:47:25] (step=0397300) Train Loss: 0.1456, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 07:47:33] (step=0397400) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 07:47:41] (step=0397500) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 07:47:49] (step=0397600) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 07:47:57] (step=0397700) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 07:48:05] (step=0397800) Train Loss: 0.1464, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 07:48:15] (step=0397900) Train Loss: 0.1467, Train Steps/Sec: 10.32, Grad Norm: 0.0059 +[2025-02-21 07:48:24] (step=0398000) Train Loss: 0.1460, Train Steps/Sec: 11.00, Grad Norm: 0.0057 +[2025-02-21 07:48:35] (step=0398100) Train Loss: 0.1468, Train Steps/Sec: 9.23, Grad Norm: 0.0063 +[2025-02-21 07:48:44] (step=0398200) Train Loss: 0.1469, Train Steps/Sec: 11.32, Grad Norm: 0.0058 +[2025-02-21 07:48:52] (step=0398300) Train Loss: 0.1455, Train Steps/Sec: 11.72, Grad Norm: 0.0059 +[2025-02-21 07:49:00] (step=0398400) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 07:49:08] (step=0398500) Train Loss: 0.1464, Train Steps/Sec: 12.65, Grad Norm: 0.0060 +[2025-02-21 07:49:16] (step=0398600) Train Loss: 0.1484, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 07:49:24] (step=0398700) Train Loss: 0.1477, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 07:49:32] (step=0398800) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 07:49:40] (step=0398900) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 07:49:48] (step=0399000) Train Loss: 0.1482, Train Steps/Sec: 11.77, Grad Norm: 0.0059 +[2025-02-21 07:49:58] (step=0399100) Train Loss: 0.1462, Train Steps/Sec: 10.27, Grad Norm: 0.0063 +[2025-02-21 07:50:07] (step=0399200) Train Loss: 0.1477, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-21 07:50:15] (step=0399300) Train Loss: 0.1481, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 07:50:23] (step=0399400) Train Loss: 0.1470, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 07:50:31] (step=0399500) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 07:50:39] (step=0399600) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 07:50:47] (step=0399700) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 07:50:55] (step=0399800) Train Loss: 0.1457, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 07:51:03] (step=0399900) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 07:51:11] (step=0400000) Train Loss: 0.1472, Train Steps/Sec: 12.21, Grad Norm: 0.0059 +[2025-02-21 07:51:15] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0400000.pt +[2025-02-21 07:51:24] (step=0400100) Train Loss: 0.1460, Train Steps/Sec: 7.45, Grad Norm: 0.0061 +[2025-02-21 07:51:34] (step=0400200) Train Loss: 0.1452, Train Steps/Sec: 10.34, Grad Norm: 0.0058 +[2025-02-21 07:51:44] (step=0400300) Train Loss: 0.1470, Train Steps/Sec: 9.75, Grad Norm: 0.0062 +[2025-02-21 07:51:53] (step=0400400) Train Loss: 0.1454, Train Steps/Sec: 10.62, Grad Norm: 0.0062 +[2025-02-21 07:52:02] (step=0400500) Train Loss: 0.1469, Train Steps/Sec: 11.71, Grad Norm: 0.0058 +[2025-02-21 07:52:10] (step=0400600) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 07:52:18] (step=0400700) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 07:52:26] (step=0400800) Train Loss: 0.1462, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 07:52:34] (step=0400900) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 07:52:42] (step=0401000) Train Loss: 0.1473, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 07:52:50] (step=0401100) Train Loss: 0.1465, Train Steps/Sec: 12.65, Grad Norm: 0.0059 +[2025-02-21 07:52:58] (step=0401200) Train Loss: 0.1471, Train Steps/Sec: 11.82, Grad Norm: 0.0059 +[2025-02-21 07:53:07] (step=0401300) Train Loss: 0.1476, Train Steps/Sec: 11.40, Grad Norm: 0.0062 +[2025-02-21 07:53:16] (step=0401400) Train Loss: 0.1470, Train Steps/Sec: 11.30, Grad Norm: 0.0059 +[2025-02-21 07:53:24] (step=0401500) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0066 +[2025-02-21 07:53:33] (step=0401600) Train Loss: 0.1471, Train Steps/Sec: 10.92, Grad Norm: 0.0060 +[2025-02-21 07:53:41] (step=0401700) Train Loss: 0.1465, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 07:53:49] (step=0401800) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 07:53:57] (step=0401900) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 07:54:04] (step=0402000) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 07:54:12] (step=0402100) Train Loss: 0.1472, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 07:54:21] (step=0402200) Train Loss: 0.1477, Train Steps/Sec: 11.42, Grad Norm: 0.0059 +[2025-02-21 07:54:30] (step=0402300) Train Loss: 0.1456, Train Steps/Sec: 11.79, Grad Norm: 0.0058 +[2025-02-21 07:54:40] (step=0402400) Train Loss: 0.1468, Train Steps/Sec: 9.27, Grad Norm: 0.0060 +[2025-02-21 07:54:50] (step=0402500) Train Loss: 0.1464, Train Steps/Sec: 10.99, Grad Norm: 0.0061 +[2025-02-21 07:54:58] (step=0402600) Train Loss: 0.1469, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 07:55:06] (step=0402700) Train Loss: 0.1460, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 07:55:14] (step=0402800) Train Loss: 0.1469, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 07:55:23] (step=0402900) Train Loss: 0.1461, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 07:55:31] (step=0403000) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 07:55:39] (step=0403100) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 07:55:47] (step=0403200) Train Loss: 0.1471, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 07:55:55] (step=0403300) Train Loss: 0.1461, Train Steps/Sec: 12.65, Grad Norm: 0.0058 +[2025-02-21 07:56:03] (step=0403400) Train Loss: 0.1472, Train Steps/Sec: 11.79, Grad Norm: 0.0059 +[2025-02-21 07:56:12] (step=0403500) Train Loss: 0.1467, Train Steps/Sec: 11.40, Grad Norm: 0.0060 +[2025-02-21 07:56:21] (step=0403600) Train Loss: 0.1472, Train Steps/Sec: 11.31, Grad Norm: 0.0058 +[2025-02-21 07:56:29] (step=0403700) Train Loss: 0.1472, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 07:56:37] (step=0403800) Train Loss: 0.1459, Train Steps/Sec: 12.12, Grad Norm: 0.0060 +[2025-02-21 07:56:45] (step=0403900) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 07:56:53] (step=0404000) Train Loss: 0.1474, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 07:57:02] (step=0404100) Train Loss: 0.1466, Train Steps/Sec: 11.31, Grad Norm: 0.0059 +[2025-02-21 07:57:10] (step=0404200) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 07:57:18] (step=0404300) Train Loss: 0.1474, Train Steps/Sec: 11.76, Grad Norm: 0.0062 +[2025-02-21 07:57:27] (step=0404400) Train Loss: 0.1465, Train Steps/Sec: 11.78, Grad Norm: 0.0059 +[2025-02-21 07:57:36] (step=0404500) Train Loss: 0.1454, Train Steps/Sec: 11.02, Grad Norm: 0.0061 +[2025-02-21 07:57:46] (step=0404600) Train Loss: 0.1463, Train Steps/Sec: 9.47, Grad Norm: 0.0058 +[2025-02-21 07:57:55] (step=0404700) Train Loss: 0.1468, Train Steps/Sec: 11.36, Grad Norm: 0.0060 +[2025-02-21 07:58:03] (step=0404800) Train Loss: 0.1460, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 07:58:12] (step=0404900) Train Loss: 0.1480, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 07:58:20] (step=0405000) Train Loss: 0.1449, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 07:58:28] (step=0405100) Train Loss: 0.1447, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 07:58:36] (step=0405200) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 07:58:44] (step=0405300) Train Loss: 0.1468, Train Steps/Sec: 12.53, Grad Norm: 0.0059 +[2025-02-21 07:58:53] (step=0405400) Train Loss: 0.1480, Train Steps/Sec: 11.26, Grad Norm: 0.0062 +[2025-02-21 07:59:00] (step=0405500) Train Loss: 0.1468, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 07:59:10] (step=0405600) Train Loss: 0.1469, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-21 07:59:18] (step=0405700) Train Loss: 0.1481, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 07:59:27] (step=0405800) Train Loss: 0.1460, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 07:59:35] (step=0405900) Train Loss: 0.1467, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 07:59:43] (step=0406000) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 07:59:51] (step=0406100) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 07:59:59] (step=0406200) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 08:00:07] (step=0406300) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0066 +[2025-02-21 08:00:15] (step=0406400) Train Loss: 0.1470, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 08:00:23] (step=0406500) Train Loss: 0.1477, Train Steps/Sec: 11.76, Grad Norm: 0.0061 +[2025-02-21 08:00:33] (step=0406600) Train Loss: 0.1461, Train Steps/Sec: 9.90, Grad Norm: 0.0060 +[2025-02-21 08:00:42] (step=0406700) Train Loss: 0.1470, Train Steps/Sec: 11.30, Grad Norm: 0.0061 +[2025-02-21 08:00:52] (step=0406800) Train Loss: 0.1460, Train Steps/Sec: 10.05, Grad Norm: 0.0059 +[2025-02-21 08:01:01] (step=0406900) Train Loss: 0.1470, Train Steps/Sec: 11.39, Grad Norm: 0.0061 +[2025-02-21 08:01:09] (step=0407000) Train Loss: 0.1467, Train Steps/Sec: 12.17, Grad Norm: 0.0062 +[2025-02-21 08:01:18] (step=0407100) Train Loss: 0.1462, Train Steps/Sec: 11.72, Grad Norm: 0.0059 +[2025-02-21 08:01:26] (step=0407200) Train Loss: 0.1479, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 08:01:34] (step=0407300) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 08:01:42] (step=0407400) Train Loss: 0.1460, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 08:01:50] (step=0407500) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 08:01:57] (step=0407600) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 08:02:05] (step=0407700) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 08:02:14] (step=0407800) Train Loss: 0.1473, Train Steps/Sec: 10.96, Grad Norm: 0.0060 +[2025-02-21 08:02:24] (step=0407900) Train Loss: 0.1480, Train Steps/Sec: 10.57, Grad Norm: 0.0062 +[2025-02-21 08:02:32] (step=0408000) Train Loss: 0.1477, Train Steps/Sec: 11.71, Grad Norm: 0.0060 +[2025-02-21 08:02:41] (step=0408100) Train Loss: 0.1470, Train Steps/Sec: 12.10, Grad Norm: 0.0058 +[2025-02-21 08:02:49] (step=0408200) Train Loss: 0.1469, Train Steps/Sec: 12.54, Grad Norm: 0.0057 +[2025-02-21 08:02:57] (step=0408300) Train Loss: 0.1477, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 08:03:05] (step=0408400) Train Loss: 0.1464, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 08:03:13] (step=0408500) Train Loss: 0.1474, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 08:03:21] (step=0408600) Train Loss: 0.1462, Train Steps/Sec: 11.31, Grad Norm: 0.0064 +[2025-02-21 08:03:30] (step=0408700) Train Loss: 0.1459, Train Steps/Sec: 11.70, Grad Norm: 0.0061 +[2025-02-21 08:03:39] (step=0408800) Train Loss: 0.1459, Train Steps/Sec: 10.98, Grad Norm: 0.0057 +[2025-02-21 08:03:48] (step=0408900) Train Loss: 0.1459, Train Steps/Sec: 11.67, Grad Norm: 0.0058 +[2025-02-21 08:03:57] (step=0409000) Train Loss: 0.1469, Train Steps/Sec: 10.27, Grad Norm: 0.0067 +[2025-02-21 08:04:07] (step=0409100) Train Loss: 0.1471, Train Steps/Sec: 10.25, Grad Norm: 0.0061 +[2025-02-21 08:04:16] (step=0409200) Train Loss: 0.1467, Train Steps/Sec: 11.72, Grad Norm: 0.0059 +[2025-02-21 08:04:24] (step=0409300) Train Loss: 0.1452, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 08:04:32] (step=0409400) Train Loss: 0.1454, Train Steps/Sec: 12.65, Grad Norm: 0.0062 +[2025-02-21 08:04:40] (step=0409500) Train Loss: 0.1464, Train Steps/Sec: 12.65, Grad Norm: 0.0065 +[2025-02-21 08:04:48] (step=0409600) Train Loss: 0.1454, Train Steps/Sec: 12.66, Grad Norm: 0.0060 +[2025-02-21 08:04:56] (step=0409700) Train Loss: 0.1471, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 08:05:04] (step=0409800) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 08:05:11] (step=0409900) Train Loss: 0.1463, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 08:05:20] (step=0410000) Train Loss: 0.1468, Train Steps/Sec: 11.04, Grad Norm: 0.0058 +[2025-02-21 08:05:29] (step=0410100) Train Loss: 0.1466, Train Steps/Sec: 11.76, Grad Norm: 0.0065 +[2025-02-21 08:05:38] (step=0410200) Train Loss: 0.1476, Train Steps/Sec: 11.74, Grad Norm: 0.0061 +[2025-02-21 08:05:46] (step=0410300) Train Loss: 0.1472, Train Steps/Sec: 12.09, Grad Norm: 0.0059 +[2025-02-21 08:05:55] (step=0410400) Train Loss: 0.1479, Train Steps/Sec: 11.32, Grad Norm: 0.0063 +[2025-02-21 08:06:03] (step=0410500) Train Loss: 0.1452, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 08:06:10] (step=0410600) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 08:06:19] (step=0410700) Train Loss: 0.1474, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 08:06:27] (step=0410800) Train Loss: 0.1461, Train Steps/Sec: 11.42, Grad Norm: 0.0058 +[2025-02-21 08:06:36] (step=0410900) Train Loss: 0.1467, Train Steps/Sec: 11.73, Grad Norm: 0.0059 +[2025-02-21 08:06:45] (step=0411000) Train Loss: 0.1460, Train Steps/Sec: 11.36, Grad Norm: 0.0057 +[2025-02-21 08:06:54] (step=0411100) Train Loss: 0.1468, Train Steps/Sec: 11.38, Grad Norm: 0.0060 +[2025-02-21 08:07:03] (step=0411200) Train Loss: 0.1469, Train Steps/Sec: 10.64, Grad Norm: 0.0059 +[2025-02-21 08:07:12] (step=0411300) Train Loss: 0.1467, Train Steps/Sec: 11.36, Grad Norm: 0.0058 +[2025-02-21 08:07:20] (step=0411400) Train Loss: 0.1467, Train Steps/Sec: 11.70, Grad Norm: 0.0059 +[2025-02-21 08:07:29] (step=0411500) Train Loss: 0.1470, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 08:07:37] (step=0411600) Train Loss: 0.1477, Train Steps/Sec: 11.26, Grad Norm: 0.0060 +[2025-02-21 08:07:45] (step=0411700) Train Loss: 0.1463, Train Steps/Sec: 12.54, Grad Norm: 0.0061 +[2025-02-21 08:07:53] (step=0411800) Train Loss: 0.1443, Train Steps/Sec: 12.55, Grad Norm: 0.0060 +[2025-02-21 08:08:01] (step=0411900) Train Loss: 0.1474, Train Steps/Sec: 12.54, Grad Norm: 0.0062 +[2025-02-21 08:08:09] (step=0412000) Train Loss: 0.1478, Train Steps/Sec: 12.54, Grad Norm: 0.0058 +[2025-02-21 08:08:18] (step=0412100) Train Loss: 0.1473, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 08:08:26] (step=0412200) Train Loss: 0.1466, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 08:08:35] (step=0412300) Train Loss: 0.1470, Train Steps/Sec: 11.33, Grad Norm: 0.0058 +[2025-02-21 08:08:44] (step=0412400) Train Loss: 0.1473, Train Steps/Sec: 12.04, Grad Norm: 0.0059 +[2025-02-21 08:08:52] (step=0412500) Train Loss: 0.1452, Train Steps/Sec: 12.04, Grad Norm: 0.0062 +[2025-02-21 08:09:00] (step=0412600) Train Loss: 0.1457, Train Steps/Sec: 12.52, Grad Norm: 0.0058 +[2025-02-21 08:09:08] (step=0412700) Train Loss: 0.1468, Train Steps/Sec: 12.56, Grad Norm: 0.0063 +[2025-02-21 08:09:16] (step=0412800) Train Loss: 0.1471, Train Steps/Sec: 12.02, Grad Norm: 0.0060 +[2025-02-21 08:09:26] (step=0412900) Train Loss: 0.1475, Train Steps/Sec: 10.61, Grad Norm: 0.0061 +[2025-02-21 08:09:34] (step=0413000) Train Loss: 0.1467, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 08:09:43] (step=0413100) Train Loss: 0.1464, Train Steps/Sec: 11.70, Grad Norm: 0.0060 +[2025-02-21 08:09:51] (step=0413200) Train Loss: 0.1475, Train Steps/Sec: 12.20, Grad Norm: 0.0061 +[2025-02-21 08:10:00] (step=0413300) Train Loss: 0.1473, Train Steps/Sec: 10.68, Grad Norm: 0.0062 +[2025-02-21 08:10:10] (step=0413400) Train Loss: 0.1468, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-21 08:10:18] (step=0413500) Train Loss: 0.1464, Train Steps/Sec: 11.71, Grad Norm: 0.0062 +[2025-02-21 08:10:27] (step=0413600) Train Loss: 0.1467, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-21 08:10:35] (step=0413700) Train Loss: 0.1464, Train Steps/Sec: 12.08, Grad Norm: 0.0058 +[2025-02-21 08:10:43] (step=0413800) Train Loss: 0.1478, Train Steps/Sec: 12.53, Grad Norm: 0.0061 +[2025-02-21 08:10:51] (step=0413900) Train Loss: 0.1465, Train Steps/Sec: 12.54, Grad Norm: 0.0058 +[2025-02-21 08:10:59] (step=0414000) Train Loss: 0.1476, Train Steps/Sec: 12.55, Grad Norm: 0.0059 +[2025-02-21 08:11:08] (step=0414100) Train Loss: 0.1473, Train Steps/Sec: 11.19, Grad Norm: 0.0057 +[2025-02-21 08:11:16] (step=0414200) Train Loss: 0.1468, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 08:11:24] (step=0414300) Train Loss: 0.1467, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 08:11:33] (step=0414400) Train Loss: 0.1462, Train Steps/Sec: 11.73, Grad Norm: 0.0057 +[2025-02-21 08:11:42] (step=0414500) Train Loss: 0.1462, Train Steps/Sec: 11.32, Grad Norm: 0.0063 +[2025-02-21 08:11:50] (step=0414600) Train Loss: 0.1470, Train Steps/Sec: 12.14, Grad Norm: 0.0063 +[2025-02-21 08:11:58] (step=0414700) Train Loss: 0.1469, Train Steps/Sec: 12.10, Grad Norm: 0.0061 +[2025-02-21 08:12:06] (step=0414800) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 08:12:14] (step=0414900) Train Loss: 0.1456, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 08:12:23] (step=0415000) Train Loss: 0.1461, Train Steps/Sec: 11.37, Grad Norm: 0.0064 +[2025-02-21 08:12:31] (step=0415100) Train Loss: 0.1466, Train Steps/Sec: 12.16, Grad Norm: 0.0058 +[2025-02-21 08:12:40] (step=0415200) Train Loss: 0.1470, Train Steps/Sec: 11.36, Grad Norm: 0.0059 +[2025-02-21 08:12:48] (step=0415300) Train Loss: 0.1470, Train Steps/Sec: 12.11, Grad Norm: 0.0059 +[2025-02-21 08:12:57] (step=0415400) Train Loss: 0.1464, Train Steps/Sec: 10.86, Grad Norm: 0.0059 +[2025-02-21 08:13:07] (step=0415500) Train Loss: 0.1476, Train Steps/Sec: 10.31, Grad Norm: 0.0063 +[2025-02-21 08:13:17] (step=0415600) Train Loss: 0.1456, Train Steps/Sec: 10.60, Grad Norm: 0.0057 +[2025-02-21 08:13:25] (step=0415700) Train Loss: 0.1474, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 08:13:33] (step=0415800) Train Loss: 0.1467, Train Steps/Sec: 12.09, Grad Norm: 0.0064 +[2025-02-21 08:13:41] (step=0415900) Train Loss: 0.1455, Train Steps/Sec: 12.12, Grad Norm: 0.0058 +[2025-02-21 08:13:49] (step=0416000) Train Loss: 0.1458, Train Steps/Sec: 12.47, Grad Norm: 0.0057 +[2025-02-21 08:13:57] (step=0416100) Train Loss: 0.1477, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 08:14:05] (step=0416200) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 08:14:13] (step=0416300) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 08:14:21] (step=0416400) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 08:14:29] (step=0416500) Train Loss: 0.1470, Train Steps/Sec: 12.17, Grad Norm: 0.0062 +[2025-02-21 08:14:39] (step=0416600) Train Loss: 0.1477, Train Steps/Sec: 10.23, Grad Norm: 0.0060 +[2025-02-21 08:14:48] (step=0416700) Train Loss: 0.1465, Train Steps/Sec: 11.33, Grad Norm: 0.0061 +[2025-02-21 08:14:56] (step=0416800) Train Loss: 0.1454, Train Steps/Sec: 11.69, Grad Norm: 0.0061 +[2025-02-21 08:15:04] (step=0416900) Train Loss: 0.1464, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 08:15:12] (step=0417000) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 08:15:20] (step=0417100) Train Loss: 0.1468, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 08:15:29] (step=0417200) Train Loss: 0.1471, Train Steps/Sec: 11.41, Grad Norm: 0.0060 +[2025-02-21 08:15:38] (step=0417300) Train Loss: 0.1475, Train Steps/Sec: 11.78, Grad Norm: 0.0060 +[2025-02-21 08:15:46] (step=0417400) Train Loss: 0.1460, Train Steps/Sec: 11.73, Grad Norm: 0.0062 +[2025-02-21 08:15:54] (step=0417500) Train Loss: 0.1472, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 08:16:02] (step=0417600) Train Loss: 0.1478, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 08:16:12] (step=0417700) Train Loss: 0.1460, Train Steps/Sec: 10.34, Grad Norm: 0.0060 +[2025-02-21 08:16:22] (step=0417800) Train Loss: 0.1470, Train Steps/Sec: 10.62, Grad Norm: 0.0062 +[2025-02-21 08:16:31] (step=0417900) Train Loss: 0.1464, Train Steps/Sec: 10.72, Grad Norm: 0.0057 +[2025-02-21 08:16:39] (step=0418000) Train Loss: 0.1469, Train Steps/Sec: 12.13, Grad Norm: 0.0064 +[2025-02-21 08:16:47] (step=0418100) Train Loss: 0.1471, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 08:16:55] (step=0418200) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 08:17:03] (step=0418300) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 08:17:11] (step=0418400) Train Loss: 0.1474, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 08:17:19] (step=0418500) Train Loss: 0.1469, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 08:17:27] (step=0418600) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 08:17:35] (step=0418700) Train Loss: 0.1460, Train Steps/Sec: 11.79, Grad Norm: 0.0058 +[2025-02-21 08:17:44] (step=0418800) Train Loss: 0.1466, Train Steps/Sec: 11.73, Grad Norm: 0.0060 +[2025-02-21 08:17:53] (step=0418900) Train Loss: 0.1470, Train Steps/Sec: 11.31, Grad Norm: 0.0061 +[2025-02-21 08:18:01] (step=0419000) Train Loss: 0.1475, Train Steps/Sec: 11.70, Grad Norm: 0.0060 +[2025-02-21 08:18:10] (step=0419100) Train Loss: 0.1471, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-21 08:18:18] (step=0419200) Train Loss: 0.1466, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 08:18:27] (step=0419300) Train Loss: 0.1480, Train Steps/Sec: 11.37, Grad Norm: 0.0058 +[2025-02-21 08:18:36] (step=0419400) Train Loss: 0.1462, Train Steps/Sec: 11.72, Grad Norm: 0.0063 +[2025-02-21 08:18:44] (step=0419500) Train Loss: 0.1465, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 08:18:52] (step=0419600) Train Loss: 0.1464, Train Steps/Sec: 12.55, Grad Norm: 0.0059 +[2025-02-21 08:19:01] (step=0419700) Train Loss: 0.1459, Train Steps/Sec: 12.04, Grad Norm: 0.0061 +[2025-02-21 08:19:09] (step=0419800) Train Loss: 0.1472, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 08:19:18] (step=0419900) Train Loss: 0.1472, Train Steps/Sec: 10.66, Grad Norm: 0.0062 +[2025-02-21 08:19:28] (step=0420000) Train Loss: 0.1465, Train Steps/Sec: 10.63, Grad Norm: 0.0058 +[2025-02-21 08:19:30] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0420000.pt +[2025-02-21 08:19:38] (step=0420100) Train Loss: 0.1460, Train Steps/Sec: 9.49, Grad Norm: 0.0061 +[2025-02-21 08:19:47] (step=0420200) Train Loss: 0.1470, Train Steps/Sec: 11.69, Grad Norm: 0.0059 +[2025-02-21 08:19:55] (step=0420300) Train Loss: 0.1458, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 08:20:04] (step=0420400) Train Loss: 0.1469, Train Steps/Sec: 11.24, Grad Norm: 0.0059 +[2025-02-21 08:20:12] (step=0420500) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 08:20:19] (step=0420600) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 08:20:27] (step=0420700) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 08:20:35] (step=0420800) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 08:20:44] (step=0420900) Train Loss: 0.1460, Train Steps/Sec: 11.78, Grad Norm: 0.0058 +[2025-02-21 08:20:52] (step=0421000) Train Loss: 0.1464, Train Steps/Sec: 11.74, Grad Norm: 0.0060 +[2025-02-21 08:21:01] (step=0421100) Train Loss: 0.1466, Train Steps/Sec: 11.37, Grad Norm: 0.0060 +[2025-02-21 08:21:10] (step=0421200) Train Loss: 0.1470, Train Steps/Sec: 11.70, Grad Norm: 0.0061 +[2025-02-21 08:21:18] (step=0421300) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 08:21:26] (step=0421400) Train Loss: 0.1464, Train Steps/Sec: 11.41, Grad Norm: 0.0059 +[2025-02-21 08:21:35] (step=0421500) Train Loss: 0.1466, Train Steps/Sec: 12.17, Grad Norm: 0.0059 +[2025-02-21 08:21:44] (step=0421600) Train Loss: 0.1480, Train Steps/Sec: 10.20, Grad Norm: 0.0061 +[2025-02-21 08:21:53] (step=0421700) Train Loss: 0.1464, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 08:22:01] (step=0421800) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 08:22:09] (step=0421900) Train Loss: 0.1462, Train Steps/Sec: 12.12, Grad Norm: 0.0060 +[2025-02-21 08:22:17] (step=0422000) Train Loss: 0.1450, Train Steps/Sec: 12.20, Grad Norm: 0.0058 +[2025-02-21 08:22:27] (step=0422100) Train Loss: 0.1468, Train Steps/Sec: 10.00, Grad Norm: 0.0062 +[2025-02-21 08:22:36] (step=0422200) Train Loss: 0.1476, Train Steps/Sec: 11.36, Grad Norm: 0.0060 +[2025-02-21 08:22:44] (step=0422300) Train Loss: 0.1459, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 08:22:53] (step=0422400) Train Loss: 0.1464, Train Steps/Sec: 12.13, Grad Norm: 0.0061 +[2025-02-21 08:23:01] (step=0422500) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 08:23:08] (step=0422600) Train Loss: 0.1465, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 08:23:16] (step=0422700) Train Loss: 0.1458, Train Steps/Sec: 12.51, Grad Norm: 0.0062 +[2025-02-21 08:23:24] (step=0422800) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 08:23:33] (step=0422900) Train Loss: 0.1461, Train Steps/Sec: 11.13, Grad Norm: 0.0059 +[2025-02-21 08:23:41] (step=0423000) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 08:23:50] (step=0423100) Train Loss: 0.1461, Train Steps/Sec: 10.95, Grad Norm: 0.0063 +[2025-02-21 08:23:58] (step=0423200) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 08:24:07] (step=0423300) Train Loss: 0.1476, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 08:24:16] (step=0423400) Train Loss: 0.1465, Train Steps/Sec: 11.68, Grad Norm: 0.0064 +[2025-02-21 08:24:24] (step=0423500) Train Loss: 0.1464, Train Steps/Sec: 12.05, Grad Norm: 0.0061 +[2025-02-21 08:24:33] (step=0423600) Train Loss: 0.1466, Train Steps/Sec: 11.77, Grad Norm: 0.0062 +[2025-02-21 08:24:41] (step=0423700) Train Loss: 0.1462, Train Steps/Sec: 11.32, Grad Norm: 0.0063 +[2025-02-21 08:24:50] (step=0423800) Train Loss: 0.1468, Train Steps/Sec: 11.74, Grad Norm: 0.0061 +[2025-02-21 08:24:58] (step=0423900) Train Loss: 0.1471, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 08:25:06] (step=0424000) Train Loss: 0.1461, Train Steps/Sec: 12.54, Grad Norm: 0.0062 +[2025-02-21 08:25:15] (step=0424100) Train Loss: 0.1458, Train Steps/Sec: 10.74, Grad Norm: 0.0057 +[2025-02-21 08:25:24] (step=0424200) Train Loss: 0.1472, Train Steps/Sec: 11.33, Grad Norm: 0.0058 +[2025-02-21 08:25:34] (step=0424300) Train Loss: 0.1460, Train Steps/Sec: 10.32, Grad Norm: 0.0062 +[2025-02-21 08:25:42] (step=0424400) Train Loss: 0.1465, Train Steps/Sec: 11.69, Grad Norm: 0.0060 +[2025-02-21 08:25:51] (step=0424500) Train Loss: 0.1458, Train Steps/Sec: 11.69, Grad Norm: 0.0060 +[2025-02-21 08:25:59] (step=0424600) Train Loss: 0.1455, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 08:26:07] (step=0424700) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 08:26:15] (step=0424800) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 08:26:23] (step=0424900) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 08:26:31] (step=0425000) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 08:26:39] (step=0425100) Train Loss: 0.1476, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 08:26:47] (step=0425200) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 08:26:56] (step=0425300) Train Loss: 0.1468, Train Steps/Sec: 11.01, Grad Norm: 0.0061 +[2025-02-21 08:27:05] (step=0425400) Train Loss: 0.1474, Train Steps/Sec: 10.77, Grad Norm: 0.0059 +[2025-02-21 08:27:14] (step=0425500) Train Loss: 0.1460, Train Steps/Sec: 11.74, Grad Norm: 0.0062 +[2025-02-21 08:27:22] (step=0425600) Train Loss: 0.1464, Train Steps/Sec: 11.69, Grad Norm: 0.0058 +[2025-02-21 08:27:31] (step=0425700) Train Loss: 0.1454, Train Steps/Sec: 11.39, Grad Norm: 0.0066 +[2025-02-21 08:27:39] (step=0425800) Train Loss: 0.1480, Train Steps/Sec: 11.75, Grad Norm: 0.0059 +[2025-02-21 08:27:48] (step=0425900) Train Loss: 0.1465, Train Steps/Sec: 11.39, Grad Norm: 0.0060 +[2025-02-21 08:27:56] (step=0426000) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 08:28:04] (step=0426100) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 08:28:12] (step=0426200) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 08:28:20] (step=0426300) Train Loss: 0.1474, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 08:28:29] (step=0426400) Train Loss: 0.1469, Train Steps/Sec: 11.02, Grad Norm: 0.0065 +[2025-02-21 08:28:39] (step=0426500) Train Loss: 0.1466, Train Steps/Sec: 10.68, Grad Norm: 0.0059 +[2025-02-21 08:28:49] (step=0426600) Train Loss: 0.1468, Train Steps/Sec: 10.12, Grad Norm: 0.0063 +[2025-02-21 08:28:57] (step=0426700) Train Loss: 0.1476, Train Steps/Sec: 12.10, Grad Norm: 0.0061 +[2025-02-21 08:29:05] (step=0426800) Train Loss: 0.1473, Train Steps/Sec: 12.13, Grad Norm: 0.0058 +[2025-02-21 08:29:13] (step=0426900) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 08:29:21] (step=0427000) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 08:29:29] (step=0427100) Train Loss: 0.1473, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 08:29:37] (step=0427200) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 08:29:45] (step=0427300) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 08:29:53] (step=0427400) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 08:30:02] (step=0427500) Train Loss: 0.1470, Train Steps/Sec: 11.00, Grad Norm: 0.0060 +[2025-02-21 08:30:10] (step=0427600) Train Loss: 0.1472, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 08:30:18] (step=0427700) Train Loss: 0.1468, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 08:30:28] (step=0427800) Train Loss: 0.1474, Train Steps/Sec: 10.94, Grad Norm: 0.0059 +[2025-02-21 08:30:37] (step=0427900) Train Loss: 0.1469, Train Steps/Sec: 10.18, Grad Norm: 0.0059 +[2025-02-21 08:30:46] (step=0428000) Train Loss: 0.1455, Train Steps/Sec: 11.78, Grad Norm: 0.0062 +[2025-02-21 08:30:54] (step=0428100) Train Loss: 0.1479, Train Steps/Sec: 12.15, Grad Norm: 0.0066 +[2025-02-21 08:31:02] (step=0428200) Train Loss: 0.1469, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 08:31:10] (step=0428300) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 08:31:18] (step=0428400) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 08:31:26] (step=0428500) Train Loss: 0.1471, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 08:31:35] (step=0428600) Train Loss: 0.1461, Train Steps/Sec: 11.01, Grad Norm: 0.0060 +[2025-02-21 08:31:45] (step=0428700) Train Loss: 0.1458, Train Steps/Sec: 10.34, Grad Norm: 0.0060 +[2025-02-21 08:31:54] (step=0428800) Train Loss: 0.1465, Train Steps/Sec: 11.71, Grad Norm: 0.0058 +[2025-02-21 08:32:02] (step=0428900) Train Loss: 0.1466, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 08:32:10] (step=0429000) Train Loss: 0.1461, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 08:32:19] (step=0429100) Train Loss: 0.1464, Train Steps/Sec: 11.21, Grad Norm: 0.0060 +[2025-02-21 08:32:27] (step=0429200) Train Loss: 0.1476, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 08:32:35] (step=0429300) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 08:32:43] (step=0429400) Train Loss: 0.1477, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 08:32:51] (step=0429500) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 08:32:59] (step=0429600) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 08:33:08] (step=0429700) Train Loss: 0.1466, Train Steps/Sec: 11.00, Grad Norm: 0.0061 +[2025-02-21 08:33:16] (step=0429800) Train Loss: 0.1478, Train Steps/Sec: 11.71, Grad Norm: 0.0062 +[2025-02-21 08:33:25] (step=0429900) Train Loss: 0.1455, Train Steps/Sec: 11.31, Grad Norm: 0.0061 +[2025-02-21 08:33:34] (step=0430000) Train Loss: 0.1463, Train Steps/Sec: 11.42, Grad Norm: 0.0059 +[2025-02-21 08:33:43] (step=0430100) Train Loss: 0.1455, Train Steps/Sec: 11.00, Grad Norm: 0.0060 +[2025-02-21 08:33:51] (step=0430200) Train Loss: 0.1472, Train Steps/Sec: 12.16, Grad Norm: 0.0062 +[2025-02-21 08:33:59] (step=0430300) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 08:34:08] (step=0430400) Train Loss: 0.1461, Train Steps/Sec: 11.14, Grad Norm: 0.0059 +[2025-02-21 08:34:16] (step=0430500) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 08:34:24] (step=0430600) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 08:34:32] (step=0430700) Train Loss: 0.1480, Train Steps/Sec: 12.09, Grad Norm: 0.0060 +[2025-02-21 08:34:42] (step=0430800) Train Loss: 0.1458, Train Steps/Sec: 10.34, Grad Norm: 0.0060 +[2025-02-21 08:34:51] (step=0430900) Train Loss: 0.1467, Train Steps/Sec: 10.94, Grad Norm: 0.0061 +[2025-02-21 08:35:00] (step=0431000) Train Loss: 0.1464, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 08:35:08] (step=0431100) Train Loss: 0.1454, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 08:35:16] (step=0431200) Train Loss: 0.1469, Train Steps/Sec: 12.14, Grad Norm: 0.0064 +[2025-02-21 08:35:24] (step=0431300) Train Loss: 0.1457, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 08:35:32] (step=0431400) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 08:35:40] (step=0431500) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 08:35:49] (step=0431600) Train Loss: 0.1459, Train Steps/Sec: 11.13, Grad Norm: 0.0062 +[2025-02-21 08:35:57] (step=0431700) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 08:36:05] (step=0431800) Train Loss: 0.1463, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 08:36:14] (step=0431900) Train Loss: 0.1458, Train Steps/Sec: 11.00, Grad Norm: 0.0059 +[2025-02-21 08:36:22] (step=0432000) Train Loss: 0.1447, Train Steps/Sec: 11.72, Grad Norm: 0.0059 +[2025-02-21 08:36:32] (step=0432100) Train Loss: 0.1473, Train Steps/Sec: 10.99, Grad Norm: 0.0061 +[2025-02-21 08:36:40] (step=0432200) Train Loss: 0.1459, Train Steps/Sec: 11.32, Grad Norm: 0.0061 +[2025-02-21 08:36:49] (step=0432300) Train Loss: 0.1466, Train Steps/Sec: 11.37, Grad Norm: 0.0060 +[2025-02-21 08:36:57] (step=0432400) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 08:37:05] (step=0432500) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 08:37:13] (step=0432600) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 08:37:21] (step=0432700) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 08:37:29] (step=0432800) Train Loss: 0.1457, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 08:37:38] (step=0432900) Train Loss: 0.1470, Train Steps/Sec: 10.45, Grad Norm: 0.0061 +[2025-02-21 08:37:48] (step=0433000) Train Loss: 0.1451, Train Steps/Sec: 10.37, Grad Norm: 0.0062 +[2025-02-21 08:37:57] (step=0433100) Train Loss: 0.1467, Train Steps/Sec: 10.95, Grad Norm: 0.0061 +[2025-02-21 08:38:05] (step=0433200) Train Loss: 0.1461, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 08:38:14] (step=0433300) Train Loss: 0.1468, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-21 08:38:22] (step=0433400) Train Loss: 0.1453, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 08:38:30] (step=0433500) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 08:38:38] (step=0433600) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 08:38:46] (step=0433700) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 08:38:54] (step=0433800) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 08:39:02] (step=0433900) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 08:39:10] (step=0434000) Train Loss: 0.1450, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 08:39:20] (step=0434100) Train Loss: 0.1458, Train Steps/Sec: 9.90, Grad Norm: 0.0059 +[2025-02-21 08:39:29] (step=0434200) Train Loss: 0.1465, Train Steps/Sec: 10.95, Grad Norm: 0.0058 +[2025-02-21 08:39:38] (step=0434300) Train Loss: 0.1457, Train Steps/Sec: 10.61, Grad Norm: 0.0059 +[2025-02-21 08:39:47] (step=0434400) Train Loss: 0.1478, Train Steps/Sec: 11.37, Grad Norm: 0.0061 +[2025-02-21 08:39:55] (step=0434500) Train Loss: 0.1470, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 08:40:03] (step=0434600) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 08:40:11] (step=0434700) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 08:40:19] (step=0434800) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 08:40:27] (step=0434900) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 08:40:35] (step=0435000) Train Loss: 0.1477, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 08:40:44] (step=0435100) Train Loss: 0.1461, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 08:40:53] (step=0435200) Train Loss: 0.1471, Train Steps/Sec: 10.68, Grad Norm: 0.0062 +[2025-02-21 08:41:02] (step=0435300) Train Loss: 0.1461, Train Steps/Sec: 10.96, Grad Norm: 0.0061 +[2025-02-21 08:41:12] (step=0435400) Train Loss: 0.1458, Train Steps/Sec: 10.47, Grad Norm: 0.0061 +[2025-02-21 08:41:20] (step=0435500) Train Loss: 0.1473, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 08:41:28] (step=0435600) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 08:41:36] (step=0435700) Train Loss: 0.1474, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 08:41:44] (step=0435800) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 08:41:52] (step=0435900) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 08:42:00] (step=0436000) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 08:42:08] (step=0436100) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 08:42:16] (step=0436200) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 08:42:25] (step=0436300) Train Loss: 0.1466, Train Steps/Sec: 10.65, Grad Norm: 0.0059 +[2025-02-21 08:42:34] (step=0436400) Train Loss: 0.1482, Train Steps/Sec: 10.98, Grad Norm: 0.0061 +[2025-02-21 08:42:44] (step=0436500) Train Loss: 0.1465, Train Steps/Sec: 10.29, Grad Norm: 0.0058 +[2025-02-21 08:42:53] (step=0436600) Train Loss: 0.1464, Train Steps/Sec: 10.46, Grad Norm: 0.0062 +[2025-02-21 08:43:01] (step=0436700) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 08:43:09] (step=0436800) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 08:43:17] (step=0436900) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 08:43:25] (step=0437000) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 08:43:33] (step=0437100) Train Loss: 0.1468, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 08:43:41] (step=0437200) Train Loss: 0.1471, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 08:43:50] (step=0437300) Train Loss: 0.1461, Train Steps/Sec: 11.75, Grad Norm: 0.0062 +[2025-02-21 08:43:59] (step=0437400) Train Loss: 0.1458, Train Steps/Sec: 10.71, Grad Norm: 0.0061 +[2025-02-21 08:44:08] (step=0437500) Train Loss: 0.1457, Train Steps/Sec: 10.98, Grad Norm: 0.0059 +[2025-02-21 08:44:17] (step=0437600) Train Loss: 0.1473, Train Steps/Sec: 11.72, Grad Norm: 0.0061 +[2025-02-21 08:44:25] (step=0437700) Train Loss: 0.1478, Train Steps/Sec: 12.17, Grad Norm: 0.0058 +[2025-02-21 08:44:33] (step=0437800) Train Loss: 0.1471, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 08:44:42] (step=0437900) Train Loss: 0.1465, Train Steps/Sec: 11.12, Grad Norm: 0.0063 +[2025-02-21 08:44:50] (step=0438000) Train Loss: 0.1468, Train Steps/Sec: 12.54, Grad Norm: 0.0061 +[2025-02-21 08:44:58] (step=0438100) Train Loss: 0.1457, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 08:45:06] (step=0438200) Train Loss: 0.1468, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 08:45:14] (step=0438300) Train Loss: 0.1465, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 08:45:22] (step=0438400) Train Loss: 0.1469, Train Steps/Sec: 12.19, Grad Norm: 0.0063 +[2025-02-21 08:45:31] (step=0438500) Train Loss: 0.1473, Train Steps/Sec: 10.40, Grad Norm: 0.0059 +[2025-02-21 08:45:41] (step=0438600) Train Loss: 0.1466, Train Steps/Sec: 11.01, Grad Norm: 0.0062 +[2025-02-21 08:45:50] (step=0438700) Train Loss: 0.1467, Train Steps/Sec: 10.24, Grad Norm: 0.0064 +[2025-02-21 08:45:58] (step=0438800) Train Loss: 0.1458, Train Steps/Sec: 12.56, Grad Norm: 0.0058 +[2025-02-21 08:46:06] (step=0438900) Train Loss: 0.1467, Train Steps/Sec: 12.53, Grad Norm: 0.0062 +[2025-02-21 08:46:14] (step=0439000) Train Loss: 0.1456, Train Steps/Sec: 12.56, Grad Norm: 0.0063 +[2025-02-21 08:46:22] (step=0439100) Train Loss: 0.1467, Train Steps/Sec: 12.49, Grad Norm: 0.0060 +[2025-02-21 08:46:31] (step=0439200) Train Loss: 0.1467, Train Steps/Sec: 11.21, Grad Norm: 0.0061 +[2025-02-21 08:46:39] (step=0439300) Train Loss: 0.1485, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 08:46:47] (step=0439400) Train Loss: 0.1466, Train Steps/Sec: 12.11, Grad Norm: 0.0059 +[2025-02-21 08:46:57] (step=0439500) Train Loss: 0.1459, Train Steps/Sec: 11.00, Grad Norm: 0.0061 +[2025-02-21 08:47:06] (step=0439600) Train Loss: 0.1462, Train Steps/Sec: 10.63, Grad Norm: 0.0062 +[2025-02-21 08:47:14] (step=0439700) Train Loss: 0.1467, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 08:47:23] (step=0439800) Train Loss: 0.1465, Train Steps/Sec: 11.67, Grad Norm: 0.0059 +[2025-02-21 08:47:31] (step=0439900) Train Loss: 0.1472, Train Steps/Sec: 12.10, Grad Norm: 0.0061 +[2025-02-21 08:47:39] (step=0440000) Train Loss: 0.1468, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 08:47:42] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0440000.pt +[2025-02-21 08:47:50] (step=0440100) Train Loss: 0.1461, Train Steps/Sec: 9.74, Grad Norm: 0.0061 +[2025-02-21 08:47:57] (step=0440200) Train Loss: 0.1451, Train Steps/Sec: 12.57, Grad Norm: 0.0064 +[2025-02-21 08:48:05] (step=0440300) Train Loss: 0.1464, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 08:48:14] (step=0440400) Train Loss: 0.1472, Train Steps/Sec: 11.17, Grad Norm: 0.0058 +[2025-02-21 08:48:22] (step=0440500) Train Loss: 0.1472, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 08:48:31] (step=0440600) Train Loss: 0.1475, Train Steps/Sec: 11.36, Grad Norm: 0.0061 +[2025-02-21 08:48:40] (step=0440700) Train Loss: 0.1469, Train Steps/Sec: 11.70, Grad Norm: 0.0063 +[2025-02-21 08:48:50] (step=0440800) Train Loss: 0.1475, Train Steps/Sec: 9.71, Grad Norm: 0.0060 +[2025-02-21 08:48:59] (step=0440900) Train Loss: 0.1460, Train Steps/Sec: 11.76, Grad Norm: 0.0057 +[2025-02-21 08:49:06] (step=0441000) Train Loss: 0.1465, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 08:49:14] (step=0441100) Train Loss: 0.1486, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 08:49:22] (step=0441200) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 08:49:30] (step=0441300) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 08:49:38] (step=0441400) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 08:49:46] (step=0441500) Train Loss: 0.1464, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 08:49:54] (step=0441600) Train Loss: 0.1458, Train Steps/Sec: 12.08, Grad Norm: 0.0061 +[2025-02-21 08:50:05] (step=0441700) Train Loss: 0.1459, Train Steps/Sec: 9.36, Grad Norm: 0.0059 +[2025-02-21 08:50:14] (step=0441800) Train Loss: 0.1474, Train Steps/Sec: 10.97, Grad Norm: 0.0058 +[2025-02-21 08:50:22] (step=0441900) Train Loss: 0.1471, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 08:50:31] (step=0442000) Train Loss: 0.1456, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 08:50:39] (step=0442100) Train Loss: 0.1450, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 08:50:47] (step=0442200) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 08:50:55] (step=0442300) Train Loss: 0.1469, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 08:51:03] (step=0442400) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 08:51:11] (step=0442500) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 08:51:19] (step=0442600) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 08:51:27] (step=0442700) Train Loss: 0.1461, Train Steps/Sec: 12.13, Grad Norm: 0.0061 +[2025-02-21 08:51:36] (step=0442800) Train Loss: 0.1463, Train Steps/Sec: 11.45, Grad Norm: 0.0063 +[2025-02-21 08:51:46] (step=0442900) Train Loss: 0.1460, Train Steps/Sec: 9.58, Grad Norm: 0.0060 +[2025-02-21 08:51:56] (step=0443000) Train Loss: 0.1467, Train Steps/Sec: 10.30, Grad Norm: 0.0062 +[2025-02-21 08:52:04] (step=0443100) Train Loss: 0.1462, Train Steps/Sec: 12.11, Grad Norm: 0.0061 +[2025-02-21 08:52:12] (step=0443200) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 08:52:20] (step=0443300) Train Loss: 0.1469, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 08:52:28] (step=0443400) Train Loss: 0.1474, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 08:52:36] (step=0443500) Train Loss: 0.1476, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 08:52:44] (step=0443600) Train Loss: 0.1484, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 08:52:52] (step=0443700) Train Loss: 0.1479, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 08:53:00] (step=0443800) Train Loss: 0.1471, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 08:53:10] (step=0443900) Train Loss: 0.1472, Train Steps/Sec: 10.33, Grad Norm: 0.0062 +[2025-02-21 08:53:19] (step=0444000) Train Loss: 0.1468, Train Steps/Sec: 10.61, Grad Norm: 0.0060 +[2025-02-21 08:53:27] (step=0444100) Train Loss: 0.1472, Train Steps/Sec: 12.54, Grad Norm: 0.0061 +[2025-02-21 08:53:37] (step=0444200) Train Loss: 0.1455, Train Steps/Sec: 10.44, Grad Norm: 0.0060 +[2025-02-21 08:53:45] (step=0444300) Train Loss: 0.1475, Train Steps/Sec: 12.11, Grad Norm: 0.0062 +[2025-02-21 08:53:53] (step=0444400) Train Loss: 0.1453, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 08:54:01] (step=0444500) Train Loss: 0.1463, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 08:54:09] (step=0444600) Train Loss: 0.1461, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 08:54:17] (step=0444700) Train Loss: 0.1457, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 08:54:25] (step=0444800) Train Loss: 0.1461, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 08:54:34] (step=0444900) Train Loss: 0.1464, Train Steps/Sec: 11.39, Grad Norm: 0.0059 +[2025-02-21 08:54:42] (step=0445000) Train Loss: 0.1457, Train Steps/Sec: 12.09, Grad Norm: 0.0059 +[2025-02-21 08:54:52] (step=0445100) Train Loss: 0.1469, Train Steps/Sec: 10.29, Grad Norm: 0.0064 +[2025-02-21 08:55:01] (step=0445200) Train Loss: 0.1464, Train Steps/Sec: 10.91, Grad Norm: 0.0061 +[2025-02-21 08:55:09] (step=0445300) Train Loss: 0.1472, Train Steps/Sec: 12.08, Grad Norm: 0.0060 +[2025-02-21 08:55:18] (step=0445400) Train Loss: 0.1470, Train Steps/Sec: 11.18, Grad Norm: 0.0060 +[2025-02-21 08:55:26] (step=0445500) Train Loss: 0.1454, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 08:55:34] (step=0445600) Train Loss: 0.1471, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 08:55:42] (step=0445700) Train Loss: 0.1482, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 08:55:50] (step=0445800) Train Loss: 0.1454, Train Steps/Sec: 12.55, Grad Norm: 0.0059 +[2025-02-21 08:55:58] (step=0445900) Train Loss: 0.1461, Train Steps/Sec: 12.11, Grad Norm: 0.0059 +[2025-02-21 08:56:06] (step=0446000) Train Loss: 0.1480, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 08:56:16] (step=0446100) Train Loss: 0.1471, Train Steps/Sec: 10.35, Grad Norm: 0.0059 +[2025-02-21 08:56:25] (step=0446200) Train Loss: 0.1465, Train Steps/Sec: 10.63, Grad Norm: 0.0058 +[2025-02-21 08:56:34] (step=0446300) Train Loss: 0.1452, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-21 08:56:42] (step=0446400) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 08:56:50] (step=0446500) Train Loss: 0.1457, Train Steps/Sec: 12.13, Grad Norm: 0.0058 +[2025-02-21 08:56:58] (step=0446600) Train Loss: 0.1475, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 08:57:07] (step=0446700) Train Loss: 0.1466, Train Steps/Sec: 11.24, Grad Norm: 0.0059 +[2025-02-21 08:57:15] (step=0446800) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 08:57:23] (step=0446900) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 08:57:31] (step=0447000) Train Loss: 0.1460, Train Steps/Sec: 11.77, Grad Norm: 0.0060 +[2025-02-21 08:57:39] (step=0447100) Train Loss: 0.1474, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 08:57:49] (step=0447200) Train Loss: 0.1471, Train Steps/Sec: 10.70, Grad Norm: 0.0061 +[2025-02-21 08:57:58] (step=0447300) Train Loss: 0.1477, Train Steps/Sec: 10.96, Grad Norm: 0.0061 +[2025-02-21 08:58:07] (step=0447400) Train Loss: 0.1473, Train Steps/Sec: 10.96, Grad Norm: 0.0062 +[2025-02-21 08:58:15] (step=0447500) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 08:58:23] (step=0447600) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 08:58:31] (step=0447700) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 08:58:39] (step=0447800) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 08:58:48] (step=0447900) Train Loss: 0.1473, Train Steps/Sec: 11.19, Grad Norm: 0.0062 +[2025-02-21 08:58:56] (step=0448000) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 08:59:04] (step=0448100) Train Loss: 0.1470, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 08:59:12] (step=0448200) Train Loss: 0.1467, Train Steps/Sec: 12.16, Grad Norm: 0.0062 +[2025-02-21 08:59:22] (step=0448300) Train Loss: 0.1470, Train Steps/Sec: 10.32, Grad Norm: 0.0062 +[2025-02-21 08:59:31] (step=0448400) Train Loss: 0.1464, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-21 08:59:39] (step=0448500) Train Loss: 0.1470, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 08:59:47] (step=0448600) Train Loss: 0.1477, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 08:59:56] (step=0448700) Train Loss: 0.1462, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 09:00:03] (step=0448800) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 09:00:11] (step=0448900) Train Loss: 0.1470, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 09:00:19] (step=0449000) Train Loss: 0.1462, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 09:00:28] (step=0449100) Train Loss: 0.1457, Train Steps/Sec: 12.11, Grad Norm: 0.0062 +[2025-02-21 09:00:37] (step=0449200) Train Loss: 0.1476, Train Steps/Sec: 10.54, Grad Norm: 0.0061 +[2025-02-21 09:00:46] (step=0449300) Train Loss: 0.1454, Train Steps/Sec: 11.37, Grad Norm: 0.0063 +[2025-02-21 09:00:55] (step=0449400) Train Loss: 0.1456, Train Steps/Sec: 11.02, Grad Norm: 0.0056 +[2025-02-21 09:01:04] (step=0449500) Train Loss: 0.1452, Train Steps/Sec: 11.31, Grad Norm: 0.0057 +[2025-02-21 09:01:13] (step=0449600) Train Loss: 0.1471, Train Steps/Sec: 11.32, Grad Norm: 0.0064 +[2025-02-21 09:01:21] (step=0449700) Train Loss: 0.1454, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 09:01:28] (step=0449800) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 09:01:36] (step=0449900) Train Loss: 0.1456, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 09:01:44] (step=0450000) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 09:01:52] (step=0450100) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 09:02:00] (step=0450200) Train Loss: 0.1463, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 09:02:08] (step=0450300) Train Loss: 0.1469, Train Steps/Sec: 12.15, Grad Norm: 0.0063 +[2025-02-21 09:02:18] (step=0450400) Train Loss: 0.1466, Train Steps/Sec: 10.48, Grad Norm: 0.0061 +[2025-02-21 09:02:28] (step=0450500) Train Loss: 0.1462, Train Steps/Sec: 10.00, Grad Norm: 0.0063 +[2025-02-21 09:02:36] (step=0450600) Train Loss: 0.1458, Train Steps/Sec: 11.75, Grad Norm: 0.0059 +[2025-02-21 09:02:45] (step=0450700) Train Loss: 0.1461, Train Steps/Sec: 11.72, Grad Norm: 0.0059 +[2025-02-21 09:02:53] (step=0450800) Train Loss: 0.1472, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 09:03:01] (step=0450900) Train Loss: 0.1463, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 09:03:09] (step=0451000) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 09:03:17] (step=0451100) Train Loss: 0.1459, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 09:03:25] (step=0451200) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 09:03:34] (step=0451300) Train Loss: 0.1457, Train Steps/Sec: 11.40, Grad Norm: 0.0062 +[2025-02-21 09:03:42] (step=0451400) Train Loss: 0.1458, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 09:03:51] (step=0451500) Train Loss: 0.1470, Train Steps/Sec: 11.35, Grad Norm: 0.0059 +[2025-02-21 09:04:00] (step=0451600) Train Loss: 0.1473, Train Steps/Sec: 11.01, Grad Norm: 0.0059 +[2025-02-21 09:04:10] (step=0451700) Train Loss: 0.1461, Train Steps/Sec: 10.23, Grad Norm: 0.0060 +[2025-02-21 09:04:18] (step=0451800) Train Loss: 0.1454, Train Steps/Sec: 11.31, Grad Norm: 0.0059 +[2025-02-21 09:04:26] (step=0451900) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 09:04:34] (step=0452000) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 09:04:42] (step=0452100) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 09:04:50] (step=0452200) Train Loss: 0.1478, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 09:04:58] (step=0452300) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 09:05:06] (step=0452400) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 09:05:14] (step=0452500) Train Loss: 0.1464, Train Steps/Sec: 12.16, Grad Norm: 0.0060 +[2025-02-21 09:05:23] (step=0452600) Train Loss: 0.1468, Train Steps/Sec: 11.36, Grad Norm: 0.0062 +[2025-02-21 09:05:33] (step=0452700) Train Loss: 0.1464, Train Steps/Sec: 10.34, Grad Norm: 0.0059 +[2025-02-21 09:05:41] (step=0452800) Train Loss: 0.1456, Train Steps/Sec: 11.75, Grad Norm: 0.0062 +[2025-02-21 09:05:51] (step=0452900) Train Loss: 0.1471, Train Steps/Sec: 10.53, Grad Norm: 0.0060 +[2025-02-21 09:05:59] (step=0453000) Train Loss: 0.1460, Train Steps/Sec: 12.13, Grad Norm: 0.0061 +[2025-02-21 09:06:07] (step=0453100) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 09:06:15] (step=0453200) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 09:06:23] (step=0453300) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 09:06:31] (step=0453400) Train Loss: 0.1466, Train Steps/Sec: 12.16, Grad Norm: 0.0062 +[2025-02-21 09:06:39] (step=0453500) Train Loss: 0.1473, Train Steps/Sec: 11.83, Grad Norm: 0.0060 +[2025-02-21 09:06:48] (step=0453600) Train Loss: 0.1472, Train Steps/Sec: 11.39, Grad Norm: 0.0062 +[2025-02-21 09:06:57] (step=0453700) Train Loss: 0.1465, Train Steps/Sec: 11.79, Grad Norm: 0.0061 +[2025-02-21 09:07:06] (step=0453800) Train Loss: 0.1473, Train Steps/Sec: 11.03, Grad Norm: 0.0062 +[2025-02-21 09:07:14] (step=0453900) Train Loss: 0.1472, Train Steps/Sec: 11.74, Grad Norm: 0.0061 +[2025-02-21 09:07:23] (step=0454000) Train Loss: 0.1459, Train Steps/Sec: 11.31, Grad Norm: 0.0060 +[2025-02-21 09:07:31] (step=0454100) Train Loss: 0.1477, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 09:07:40] (step=0454200) Train Loss: 0.1459, Train Steps/Sec: 11.29, Grad Norm: 0.0059 +[2025-02-21 09:07:48] (step=0454300) Train Loss: 0.1478, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 09:07:56] (step=0454400) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 09:08:04] (step=0454500) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 09:08:12] (step=0454600) Train Loss: 0.1476, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 09:08:20] (step=0454700) Train Loss: 0.1460, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 09:08:29] (step=0454800) Train Loss: 0.1469, Train Steps/Sec: 11.06, Grad Norm: 0.0063 +[2025-02-21 09:08:39] (step=0454900) Train Loss: 0.1463, Train Steps/Sec: 10.06, Grad Norm: 0.0061 +[2025-02-21 09:08:47] (step=0455000) Train Loss: 0.1453, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 09:08:55] (step=0455100) Train Loss: 0.1464, Train Steps/Sec: 11.70, Grad Norm: 0.0061 +[2025-02-21 09:09:04] (step=0455200) Train Loss: 0.1468, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 09:09:12] (step=0455300) Train Loss: 0.1469, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 09:09:20] (step=0455400) Train Loss: 0.1457, Train Steps/Sec: 11.28, Grad Norm: 0.0059 +[2025-02-21 09:09:28] (step=0455500) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 09:09:37] (step=0455600) Train Loss: 0.1475, Train Steps/Sec: 11.40, Grad Norm: 0.0062 +[2025-02-21 09:09:46] (step=0455700) Train Loss: 0.1477, Train Steps/Sec: 11.39, Grad Norm: 0.0062 +[2025-02-21 09:09:54] (step=0455800) Train Loss: 0.1462, Train Steps/Sec: 11.77, Grad Norm: 0.0059 +[2025-02-21 09:10:02] (step=0455900) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 09:10:11] (step=0456000) Train Loss: 0.1471, Train Steps/Sec: 11.00, Grad Norm: 0.0061 +[2025-02-21 09:10:20] (step=0456100) Train Loss: 0.1468, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 09:10:29] (step=0456200) Train Loss: 0.1470, Train Steps/Sec: 11.74, Grad Norm: 0.0062 +[2025-02-21 09:10:37] (step=0456300) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 09:10:45] (step=0456400) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 09:10:53] (step=0456500) Train Loss: 0.1452, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 09:11:01] (step=0456600) Train Loss: 0.1465, Train Steps/Sec: 12.56, Grad Norm: 0.0063 +[2025-02-21 09:11:09] (step=0456700) Train Loss: 0.1469, Train Steps/Sec: 11.30, Grad Norm: 0.0061 +[2025-02-21 09:11:17] (step=0456800) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 09:11:26] (step=0456900) Train Loss: 0.1466, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 09:11:35] (step=0457000) Train Loss: 0.1478, Train Steps/Sec: 10.34, Grad Norm: 0.0062 +[2025-02-21 09:11:45] (step=0457100) Train Loss: 0.1463, Train Steps/Sec: 10.66, Grad Norm: 0.0061 +[2025-02-21 09:11:53] (step=0457200) Train Loss: 0.1461, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 09:12:01] (step=0457300) Train Loss: 0.1457, Train Steps/Sec: 11.72, Grad Norm: 0.0058 +[2025-02-21 09:12:09] (step=0457400) Train Loss: 0.1475, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 09:12:17] (step=0457500) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 09:12:25] (step=0457600) Train Loss: 0.1472, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 09:12:34] (step=0457700) Train Loss: 0.1463, Train Steps/Sec: 11.40, Grad Norm: 0.0061 +[2025-02-21 09:12:42] (step=0457800) Train Loss: 0.1464, Train Steps/Sec: 12.16, Grad Norm: 0.0060 +[2025-02-21 09:12:52] (step=0457900) Train Loss: 0.1457, Train Steps/Sec: 10.24, Grad Norm: 0.0059 +[2025-02-21 09:13:00] (step=0458000) Train Loss: 0.1468, Train Steps/Sec: 12.19, Grad Norm: 0.0062 +[2025-02-21 09:13:08] (step=0458100) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 09:13:17] (step=0458200) Train Loss: 0.1474, Train Steps/Sec: 10.97, Grad Norm: 0.0060 +[2025-02-21 09:13:26] (step=0458300) Train Loss: 0.1453, Train Steps/Sec: 10.97, Grad Norm: 0.0063 +[2025-02-21 09:13:35] (step=0458400) Train Loss: 0.1462, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 09:13:42] (step=0458500) Train Loss: 0.1471, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 09:13:50] (step=0458600) Train Loss: 0.1480, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 09:13:58] (step=0458700) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 09:14:06] (step=0458800) Train Loss: 0.1474, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 09:14:14] (step=0458900) Train Loss: 0.1463, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 09:14:22] (step=0459000) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 09:14:30] (step=0459100) Train Loss: 0.1458, Train Steps/Sec: 12.09, Grad Norm: 0.0059 +[2025-02-21 09:14:41] (step=0459200) Train Loss: 0.1469, Train Steps/Sec: 9.22, Grad Norm: 0.0061 +[2025-02-21 09:14:50] (step=0459300) Train Loss: 0.1461, Train Steps/Sec: 10.97, Grad Norm: 0.0064 +[2025-02-21 09:14:59] (step=0459400) Train Loss: 0.1459, Train Steps/Sec: 12.12, Grad Norm: 0.0060 +[2025-02-21 09:15:07] (step=0459500) Train Loss: 0.1465, Train Steps/Sec: 12.12, Grad Norm: 0.0060 +[2025-02-21 09:15:15] (step=0459600) Train Loss: 0.1465, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 09:15:23] (step=0459700) Train Loss: 0.1469, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 09:15:31] (step=0459800) Train Loss: 0.1477, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 09:15:40] (step=0459900) Train Loss: 0.1460, Train Steps/Sec: 11.81, Grad Norm: 0.0059 +[2025-02-21 09:15:48] (step=0460000) Train Loss: 0.1466, Train Steps/Sec: 11.39, Grad Norm: 0.0062 +[2025-02-21 09:15:52] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0460000.pt +[2025-02-21 09:16:00] (step=0460100) Train Loss: 0.1465, Train Steps/Sec: 8.61, Grad Norm: 0.0069 +[2025-02-21 09:16:08] (step=0460200) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 09:16:16] (step=0460300) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 09:16:26] (step=0460400) Train Loss: 0.1470, Train Steps/Sec: 9.97, Grad Norm: 0.0063 +[2025-02-21 09:16:35] (step=0460500) Train Loss: 0.1469, Train Steps/Sec: 10.90, Grad Norm: 0.0061 +[2025-02-21 09:16:43] (step=0460600) Train Loss: 0.1466, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 09:16:51] (step=0460700) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 09:16:59] (step=0460800) Train Loss: 0.1455, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 09:17:07] (step=0460900) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 09:17:15] (step=0461000) Train Loss: 0.1473, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 09:17:23] (step=0461100) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 09:17:31] (step=0461200) Train Loss: 0.1457, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 09:17:40] (step=0461300) Train Loss: 0.1460, Train Steps/Sec: 11.70, Grad Norm: 0.0061 +[2025-02-21 09:17:49] (step=0461400) Train Loss: 0.1456, Train Steps/Sec: 10.37, Grad Norm: 0.0059 +[2025-02-21 09:17:58] (step=0461500) Train Loss: 0.1462, Train Steps/Sec: 10.98, Grad Norm: 0.0062 +[2025-02-21 09:18:06] (step=0461600) Train Loss: 0.1472, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 09:18:16] (step=0461700) Train Loss: 0.1456, Train Steps/Sec: 10.54, Grad Norm: 0.0060 +[2025-02-21 09:18:24] (step=0461800) Train Loss: 0.1472, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 09:18:32] (step=0461900) Train Loss: 0.1467, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 09:18:41] (step=0462000) Train Loss: 0.1480, Train Steps/Sec: 11.42, Grad Norm: 0.0063 +[2025-02-21 09:18:50] (step=0462100) Train Loss: 0.1465, Train Steps/Sec: 11.37, Grad Norm: 0.0060 +[2025-02-21 09:18:58] (step=0462200) Train Loss: 0.1464, Train Steps/Sec: 11.69, Grad Norm: 0.0062 +[2025-02-21 09:19:06] (step=0462300) Train Loss: 0.1473, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 09:19:14] (step=0462400) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 09:19:22] (step=0462500) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 09:19:31] (step=0462600) Train Loss: 0.1469, Train Steps/Sec: 10.63, Grad Norm: 0.0058 +[2025-02-21 09:19:40] (step=0462700) Train Loss: 0.1448, Train Steps/Sec: 11.34, Grad Norm: 0.0062 +[2025-02-21 09:19:48] (step=0462800) Train Loss: 0.1461, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 09:19:57] (step=0462900) Train Loss: 0.1454, Train Steps/Sec: 11.25, Grad Norm: 0.0064 +[2025-02-21 09:20:05] (step=0463000) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 09:20:13] (step=0463100) Train Loss: 0.1469, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 09:20:21] (step=0463200) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 09:20:29] (step=0463300) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 09:20:37] (step=0463400) Train Loss: 0.1461, Train Steps/Sec: 12.16, Grad Norm: 0.0060 +[2025-02-21 09:20:46] (step=0463500) Train Loss: 0.1468, Train Steps/Sec: 11.76, Grad Norm: 0.0059 +[2025-02-21 09:20:55] (step=0463600) Train Loss: 0.1461, Train Steps/Sec: 10.70, Grad Norm: 0.0060 +[2025-02-21 09:21:04] (step=0463700) Train Loss: 0.1471, Train Steps/Sec: 11.02, Grad Norm: 0.0059 +[2025-02-21 09:21:13] (step=0463800) Train Loss: 0.1472, Train Steps/Sec: 11.70, Grad Norm: 0.0061 +[2025-02-21 09:21:21] (step=0463900) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 09:21:29] (step=0464000) Train Loss: 0.1465, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 09:21:37] (step=0464100) Train Loss: 0.1459, Train Steps/Sec: 12.09, Grad Norm: 0.0062 +[2025-02-21 09:21:47] (step=0464200) Train Loss: 0.1465, Train Steps/Sec: 9.98, Grad Norm: 0.0062 +[2025-02-21 09:21:56] (step=0464300) Train Loss: 0.1469, Train Steps/Sec: 11.41, Grad Norm: 0.0063 +[2025-02-21 09:22:04] (step=0464400) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 09:22:12] (step=0464500) Train Loss: 0.1476, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 09:22:20] (step=0464600) Train Loss: 0.1460, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 09:22:28] (step=0464700) Train Loss: 0.1467, Train Steps/Sec: 12.19, Grad Norm: 0.0057 +[2025-02-21 09:22:37] (step=0464800) Train Loss: 0.1469, Train Steps/Sec: 11.00, Grad Norm: 0.0062 +[2025-02-21 09:22:46] (step=0464900) Train Loss: 0.1464, Train Steps/Sec: 11.36, Grad Norm: 0.0060 +[2025-02-21 09:22:54] (step=0465000) Train Loss: 0.1458, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 09:23:02] (step=0465100) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 09:23:10] (step=0465200) Train Loss: 0.1474, Train Steps/Sec: 12.65, Grad Norm: 0.0060 +[2025-02-21 09:23:18] (step=0465300) Train Loss: 0.1453, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 09:23:27] (step=0465400) Train Loss: 0.1466, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-21 09:23:35] (step=0465500) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 09:23:43] (step=0465600) Train Loss: 0.1466, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 09:23:52] (step=0465700) Train Loss: 0.1462, Train Steps/Sec: 11.01, Grad Norm: 0.0062 +[2025-02-21 09:24:01] (step=0465800) Train Loss: 0.1461, Train Steps/Sec: 11.06, Grad Norm: 0.0061 +[2025-02-21 09:24:10] (step=0465900) Train Loss: 0.1465, Train Steps/Sec: 11.33, Grad Norm: 0.0063 +[2025-02-21 09:24:18] (step=0466000) Train Loss: 0.1468, Train Steps/Sec: 11.69, Grad Norm: 0.0066 +[2025-02-21 09:24:27] (step=0466100) Train Loss: 0.1455, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 09:24:35] (step=0466200) Train Loss: 0.1464, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 09:24:43] (step=0466300) Train Loss: 0.1464, Train Steps/Sec: 11.41, Grad Norm: 0.0059 +[2025-02-21 09:24:52] (step=0466400) Train Loss: 0.1474, Train Steps/Sec: 11.35, Grad Norm: 0.0061 +[2025-02-21 09:25:01] (step=0466500) Train Loss: 0.1474, Train Steps/Sec: 11.76, Grad Norm: 0.0058 +[2025-02-21 09:25:09] (step=0466600) Train Loss: 0.1465, Train Steps/Sec: 12.55, Grad Norm: 0.0062 +[2025-02-21 09:25:17] (step=0466700) Train Loss: 0.1456, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-21 09:25:25] (step=0466800) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 09:25:34] (step=0466900) Train Loss: 0.1454, Train Steps/Sec: 12.19, Grad Norm: 0.0059 +[2025-02-21 09:25:43] (step=0467000) Train Loss: 0.1471, Train Steps/Sec: 10.98, Grad Norm: 0.0061 +[2025-02-21 09:25:51] (step=0467100) Train Loss: 0.1458, Train Steps/Sec: 11.36, Grad Norm: 0.0060 +[2025-02-21 09:26:00] (step=0467200) Train Loss: 0.1462, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 09:26:08] (step=0467300) Train Loss: 0.1479, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 09:26:16] (step=0467400) Train Loss: 0.1451, Train Steps/Sec: 12.64, Grad Norm: 0.0065 +[2025-02-21 09:26:23] (step=0467500) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 09:26:31] (step=0467600) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 09:26:39] (step=0467700) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 09:26:48] (step=0467800) Train Loss: 0.1461, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 09:26:58] (step=0467900) Train Loss: 0.1479, Train Steps/Sec: 9.98, Grad Norm: 0.0063 +[2025-02-21 09:27:07] (step=0468000) Train Loss: 0.1470, Train Steps/Sec: 10.32, Grad Norm: 0.0062 +[2025-02-21 09:27:15] (step=0468100) Train Loss: 0.1460, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 09:27:24] (step=0468200) Train Loss: 0.1474, Train Steps/Sec: 11.72, Grad Norm: 0.0059 +[2025-02-21 09:27:32] (step=0468300) Train Loss: 0.1465, Train Steps/Sec: 12.16, Grad Norm: 0.0062 +[2025-02-21 09:27:41] (step=0468400) Train Loss: 0.1457, Train Steps/Sec: 11.44, Grad Norm: 0.0059 +[2025-02-21 09:27:50] (step=0468500) Train Loss: 0.1464, Train Steps/Sec: 11.41, Grad Norm: 0.0060 +[2025-02-21 09:27:58] (step=0468600) Train Loss: 0.1463, Train Steps/Sec: 11.78, Grad Norm: 0.0059 +[2025-02-21 09:28:06] (step=0468700) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 09:28:14] (step=0468800) Train Loss: 0.1472, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 09:28:22] (step=0468900) Train Loss: 0.1476, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 09:28:30] (step=0469000) Train Loss: 0.1478, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 09:28:38] (step=0469100) Train Loss: 0.1470, Train Steps/Sec: 12.10, Grad Norm: 0.0061 +[2025-02-21 09:28:48] (step=0469200) Train Loss: 0.1457, Train Steps/Sec: 9.90, Grad Norm: 0.0059 +[2025-02-21 09:28:57] (step=0469300) Train Loss: 0.1454, Train Steps/Sec: 11.33, Grad Norm: 0.0061 +[2025-02-21 09:29:05] (step=0469400) Train Loss: 0.1467, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 09:29:13] (step=0469500) Train Loss: 0.1455, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 09:29:21] (step=0469600) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 09:29:29] (step=0469700) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 09:29:37] (step=0469800) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 09:29:45] (step=0469900) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 09:29:53] (step=0470000) Train Loss: 0.1463, Train Steps/Sec: 12.17, Grad Norm: 0.0066 +[2025-02-21 09:30:03] (step=0470100) Train Loss: 0.1470, Train Steps/Sec: 10.70, Grad Norm: 0.0060 +[2025-02-21 09:30:12] (step=0470200) Train Loss: 0.1468, Train Steps/Sec: 10.63, Grad Norm: 0.0058 +[2025-02-21 09:30:20] (step=0470300) Train Loss: 0.1474, Train Steps/Sec: 12.18, Grad Norm: 0.0062 +[2025-02-21 09:30:30] (step=0470400) Train Loss: 0.1460, Train Steps/Sec: 10.42, Grad Norm: 0.0061 +[2025-02-21 09:30:38] (step=0470500) Train Loss: 0.1466, Train Steps/Sec: 11.60, Grad Norm: 0.0063 +[2025-02-21 09:30:47] (step=0470600) Train Loss: 0.1450, Train Steps/Sec: 11.41, Grad Norm: 0.0060 +[2025-02-21 09:30:56] (step=0470700) Train Loss: 0.1461, Train Steps/Sec: 11.37, Grad Norm: 0.0061 +[2025-02-21 09:31:04] (step=0470800) Train Loss: 0.1478, Train Steps/Sec: 12.22, Grad Norm: 0.0060 +[2025-02-21 09:31:12] (step=0470900) Train Loss: 0.1452, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 09:31:20] (step=0471000) Train Loss: 0.1449, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 09:31:28] (step=0471100) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 09:31:36] (step=0471200) Train Loss: 0.1460, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 09:31:45] (step=0471300) Train Loss: 0.1466, Train Steps/Sec: 11.38, Grad Norm: 0.0062 +[2025-02-21 09:31:53] (step=0471400) Train Loss: 0.1468, Train Steps/Sec: 11.72, Grad Norm: 0.0061 +[2025-02-21 09:32:02] (step=0471500) Train Loss: 0.1466, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 09:32:10] (step=0471600) Train Loss: 0.1453, Train Steps/Sec: 12.08, Grad Norm: 0.0058 +[2025-02-21 09:32:19] (step=0471700) Train Loss: 0.1454, Train Steps/Sec: 11.28, Grad Norm: 0.0065 +[2025-02-21 09:32:27] (step=0471800) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 09:32:35] (step=0471900) Train Loss: 0.1451, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 09:32:43] (step=0472000) Train Loss: 0.1451, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 09:32:51] (step=0472100) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 09:32:59] (step=0472200) Train Loss: 0.1475, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 09:33:09] (step=0472300) Train Loss: 0.1463, Train Steps/Sec: 10.37, Grad Norm: 0.0062 +[2025-02-21 09:33:18] (step=0472400) Train Loss: 0.1457, Train Steps/Sec: 10.68, Grad Norm: 0.0061 +[2025-02-21 09:33:26] (step=0472500) Train Loss: 0.1471, Train Steps/Sec: 12.12, Grad Norm: 0.0060 +[2025-02-21 09:33:35] (step=0472600) Train Loss: 0.1462, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 09:33:44] (step=0472700) Train Loss: 0.1457, Train Steps/Sec: 11.38, Grad Norm: 0.0060 +[2025-02-21 09:33:52] (step=0472800) Train Loss: 0.1449, Train Steps/Sec: 11.39, Grad Norm: 0.0058 +[2025-02-21 09:34:02] (step=0472900) Train Loss: 0.1463, Train Steps/Sec: 10.57, Grad Norm: 0.0060 +[2025-02-21 09:34:10] (step=0473000) Train Loss: 0.1464, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 09:34:18] (step=0473100) Train Loss: 0.1478, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 09:34:26] (step=0473200) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 09:34:34] (step=0473300) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 09:34:42] (step=0473400) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 09:34:51] (step=0473500) Train Loss: 0.1471, Train Steps/Sec: 10.98, Grad Norm: 0.0060 +[2025-02-21 09:34:59] (step=0473600) Train Loss: 0.1455, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 09:35:08] (step=0473700) Train Loss: 0.1464, Train Steps/Sec: 10.92, Grad Norm: 0.0059 +[2025-02-21 09:35:16] (step=0473800) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 09:35:24] (step=0473900) Train Loss: 0.1473, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 09:35:32] (step=0474000) Train Loss: 0.1464, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 09:35:40] (step=0474100) Train Loss: 0.1475, Train Steps/Sec: 12.52, Grad Norm: 0.0058 +[2025-02-21 09:35:49] (step=0474200) Train Loss: 0.1470, Train Steps/Sec: 11.27, Grad Norm: 0.0057 +[2025-02-21 09:35:57] (step=0474300) Train Loss: 0.1465, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 09:36:05] (step=0474400) Train Loss: 0.1461, Train Steps/Sec: 11.72, Grad Norm: 0.0058 +[2025-02-21 09:36:15] (step=0474500) Train Loss: 0.1466, Train Steps/Sec: 10.35, Grad Norm: 0.0063 +[2025-02-21 09:36:24] (step=0474600) Train Loss: 0.1460, Train Steps/Sec: 11.00, Grad Norm: 0.0062 +[2025-02-21 09:36:33] (step=0474700) Train Loss: 0.1464, Train Steps/Sec: 11.70, Grad Norm: 0.0058 +[2025-02-21 09:36:41] (step=0474800) Train Loss: 0.1455, Train Steps/Sec: 11.38, Grad Norm: 0.0061 +[2025-02-21 09:36:51] (step=0474900) Train Loss: 0.1461, Train Steps/Sec: 10.99, Grad Norm: 0.0063 +[2025-02-21 09:36:59] (step=0475000) Train Loss: 0.1471, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 09:37:07] (step=0475100) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 09:37:15] (step=0475200) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 09:37:23] (step=0475300) Train Loss: 0.1462, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 09:37:32] (step=0475400) Train Loss: 0.1470, Train Steps/Sec: 11.25, Grad Norm: 0.0059 +[2025-02-21 09:37:40] (step=0475500) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 09:37:48] (step=0475600) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 09:37:57] (step=0475700) Train Loss: 0.1468, Train Steps/Sec: 10.63, Grad Norm: 0.0060 +[2025-02-21 09:38:05] (step=0475800) Train Loss: 0.1468, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 09:38:14] (step=0475900) Train Loss: 0.1459, Train Steps/Sec: 11.35, Grad Norm: 0.0062 +[2025-02-21 09:38:22] (step=0476000) Train Loss: 0.1456, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 09:38:30] (step=0476100) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 09:38:38] (step=0476200) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 09:38:46] (step=0476300) Train Loss: 0.1449, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 09:38:54] (step=0476400) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 09:39:02] (step=0476500) Train Loss: 0.1462, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 09:39:10] (step=0476600) Train Loss: 0.1467, Train Steps/Sec: 11.70, Grad Norm: 0.0062 +[2025-02-21 09:39:21] (step=0476700) Train Loss: 0.1475, Train Steps/Sec: 9.21, Grad Norm: 0.0059 +[2025-02-21 09:39:30] (step=0476800) Train Loss: 0.1457, Train Steps/Sec: 11.37, Grad Norm: 0.0066 +[2025-02-21 09:39:39] (step=0476900) Train Loss: 0.1454, Train Steps/Sec: 10.95, Grad Norm: 0.0058 +[2025-02-21 09:39:47] (step=0477000) Train Loss: 0.1458, Train Steps/Sec: 12.22, Grad Norm: 0.0058 +[2025-02-21 09:39:57] (step=0477100) Train Loss: 0.1474, Train Steps/Sec: 10.65, Grad Norm: 0.0059 +[2025-02-21 09:40:05] (step=0477200) Train Loss: 0.1472, Train Steps/Sec: 12.20, Grad Norm: 0.0061 +[2025-02-21 09:40:13] (step=0477300) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 09:40:21] (step=0477400) Train Loss: 0.1473, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 09:40:28] (step=0477500) Train Loss: 0.1458, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 09:40:36] (step=0477600) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 09:40:44] (step=0477700) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 09:40:52] (step=0477800) Train Loss: 0.1451, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 09:41:03] (step=0477900) Train Loss: 0.1477, Train Steps/Sec: 9.66, Grad Norm: 0.0060 +[2025-02-21 09:41:11] (step=0478000) Train Loss: 0.1456, Train Steps/Sec: 11.71, Grad Norm: 0.0062 +[2025-02-21 09:41:20] (step=0478100) Train Loss: 0.1466, Train Steps/Sec: 11.70, Grad Norm: 0.0059 +[2025-02-21 09:41:28] (step=0478200) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 09:41:36] (step=0478300) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 09:41:44] (step=0478400) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 09:41:51] (step=0478500) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 09:41:59] (step=0478600) Train Loss: 0.1478, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 09:42:07] (step=0478700) Train Loss: 0.1462, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 09:42:16] (step=0478800) Train Loss: 0.1463, Train Steps/Sec: 11.72, Grad Norm: 0.0061 +[2025-02-21 09:42:26] (step=0478900) Train Loss: 0.1462, Train Steps/Sec: 9.71, Grad Norm: 0.0059 +[2025-02-21 09:42:35] (step=0479000) Train Loss: 0.1469, Train Steps/Sec: 11.36, Grad Norm: 0.0060 +[2025-02-21 09:42:44] (step=0479100) Train Loss: 0.1471, Train Steps/Sec: 10.98, Grad Norm: 0.0058 +[2025-02-21 09:42:54] (step=0479200) Train Loss: 0.1464, Train Steps/Sec: 10.23, Grad Norm: 0.0059 +[2025-02-21 09:43:03] (step=0479300) Train Loss: 0.1461, Train Steps/Sec: 11.33, Grad Norm: 0.0060 +[2025-02-21 09:43:11] (step=0479400) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 09:43:19] (step=0479500) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 09:43:27] (step=0479600) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 09:43:34] (step=0479700) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 09:43:42] (step=0479800) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 09:43:50] (step=0479900) Train Loss: 0.1458, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 09:43:58] (step=0480000) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 09:44:03] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0480000.pt +[2025-02-21 09:44:12] (step=0480100) Train Loss: 0.1453, Train Steps/Sec: 7.15, Grad Norm: 0.0062 +[2025-02-21 09:44:21] (step=0480200) Train Loss: 0.1461, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 09:44:29] (step=0480300) Train Loss: 0.1471, Train Steps/Sec: 11.35, Grad Norm: 0.0063 +[2025-02-21 09:44:38] (step=0480400) Train Loss: 0.1463, Train Steps/Sec: 11.21, Grad Norm: 0.0060 +[2025-02-21 09:44:46] (step=0480500) Train Loss: 0.1462, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 09:44:54] (step=0480600) Train Loss: 0.1448, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 09:45:02] (step=0480700) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 09:45:10] (step=0480800) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 09:45:18] (step=0480900) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 09:45:27] (step=0481000) Train Loss: 0.1461, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 09:45:37] (step=0481100) Train Loss: 0.1469, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-21 09:45:46] (step=0481200) Train Loss: 0.1467, Train Steps/Sec: 11.41, Grad Norm: 0.0059 +[2025-02-21 09:45:55] (step=0481300) Train Loss: 0.1465, Train Steps/Sec: 10.96, Grad Norm: 0.0059 +[2025-02-21 09:46:04] (step=0481400) Train Loss: 0.1466, Train Steps/Sec: 10.98, Grad Norm: 0.0061 +[2025-02-21 09:46:12] (step=0481500) Train Loss: 0.1441, Train Steps/Sec: 12.20, Grad Norm: 0.0059 +[2025-02-21 09:46:20] (step=0481600) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 09:46:29] (step=0481700) Train Loss: 0.1455, Train Steps/Sec: 11.25, Grad Norm: 0.0058 +[2025-02-21 09:46:37] (step=0481800) Train Loss: 0.1475, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 09:46:45] (step=0481900) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 09:46:53] (step=0482000) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 09:47:00] (step=0482100) Train Loss: 0.1463, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 09:47:08] (step=0482200) Train Loss: 0.1453, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 09:47:18] (step=0482300) Train Loss: 0.1465, Train Steps/Sec: 10.98, Grad Norm: 0.0060 +[2025-02-21 09:47:26] (step=0482400) Train Loss: 0.1455, Train Steps/Sec: 11.35, Grad Norm: 0.0061 +[2025-02-21 09:47:35] (step=0482500) Train Loss: 0.1469, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 09:47:43] (step=0482600) Train Loss: 0.1473, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 09:47:50] (step=0482700) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 09:47:58] (step=0482800) Train Loss: 0.1470, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 09:48:07] (step=0482900) Train Loss: 0.1465, Train Steps/Sec: 11.27, Grad Norm: 0.0063 +[2025-02-21 09:48:15] (step=0483000) Train Loss: 0.1471, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 09:48:23] (step=0483100) Train Loss: 0.1465, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 09:48:33] (step=0483200) Train Loss: 0.1461, Train Steps/Sec: 10.63, Grad Norm: 0.0061 +[2025-02-21 09:48:43] (step=0483300) Train Loss: 0.1471, Train Steps/Sec: 10.35, Grad Norm: 0.0060 +[2025-02-21 09:48:51] (step=0483400) Train Loss: 0.1471, Train Steps/Sec: 11.76, Grad Norm: 0.0060 +[2025-02-21 09:49:00] (step=0483500) Train Loss: 0.1477, Train Steps/Sec: 10.98, Grad Norm: 0.0062 +[2025-02-21 09:49:09] (step=0483600) Train Loss: 0.1468, Train Steps/Sec: 11.38, Grad Norm: 0.0059 +[2025-02-21 09:49:17] (step=0483700) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 09:49:25] (step=0483800) Train Loss: 0.1456, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 09:49:33] (step=0483900) Train Loss: 0.1452, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 09:49:41] (step=0484000) Train Loss: 0.1455, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 09:49:49] (step=0484100) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 09:49:58] (step=0484200) Train Loss: 0.1479, Train Steps/Sec: 11.20, Grad Norm: 0.0058 +[2025-02-21 09:50:05] (step=0484300) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 09:50:13] (step=0484400) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 09:50:22] (step=0484500) Train Loss: 0.1463, Train Steps/Sec: 11.31, Grad Norm: 0.0060 +[2025-02-21 09:50:31] (step=0484600) Train Loss: 0.1452, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 09:50:39] (step=0484700) Train Loss: 0.1466, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 09:50:47] (step=0484800) Train Loss: 0.1456, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 09:50:55] (step=0484900) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 09:51:03] (step=0485000) Train Loss: 0.1458, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 09:51:11] (step=0485100) Train Loss: 0.1485, Train Steps/Sec: 12.58, Grad Norm: 0.0057 +[2025-02-21 09:51:19] (step=0485200) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 09:51:27] (step=0485300) Train Loss: 0.1461, Train Steps/Sec: 12.12, Grad Norm: 0.0063 +[2025-02-21 09:51:38] (step=0485400) Train Loss: 0.1466, Train Steps/Sec: 9.68, Grad Norm: 0.0060 +[2025-02-21 09:51:47] (step=0485500) Train Loss: 0.1451, Train Steps/Sec: 10.31, Grad Norm: 0.0066 +[2025-02-21 09:51:57] (step=0485600) Train Loss: 0.1458, Train Steps/Sec: 10.61, Grad Norm: 0.0061 +[2025-02-21 09:52:05] (step=0485700) Train Loss: 0.1475, Train Steps/Sec: 11.74, Grad Norm: 0.0058 +[2025-02-21 09:52:14] (step=0485800) Train Loss: 0.1474, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 09:52:21] (step=0485900) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 09:52:29] (step=0486000) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 09:52:37] (step=0486100) Train Loss: 0.1450, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 09:52:45] (step=0486200) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 09:52:53] (step=0486300) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 09:53:01] (step=0486400) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 09:53:09] (step=0486500) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 09:53:17] (step=0486600) Train Loss: 0.1463, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 09:53:27] (step=0486700) Train Loss: 0.1457, Train Steps/Sec: 9.95, Grad Norm: 0.0060 +[2025-02-21 09:53:36] (step=0486800) Train Loss: 0.1471, Train Steps/Sec: 11.29, Grad Norm: 0.0058 +[2025-02-21 09:53:45] (step=0486900) Train Loss: 0.1470, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 09:53:52] (step=0487000) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 09:54:00] (step=0487100) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 09:54:08] (step=0487200) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 09:54:16] (step=0487300) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 09:54:24] (step=0487400) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 09:54:32] (step=0487500) Train Loss: 0.1467, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 09:54:42] (step=0487600) Train Loss: 0.1460, Train Steps/Sec: 10.04, Grad Norm: 0.0060 +[2025-02-21 09:54:52] (step=0487700) Train Loss: 0.1462, Train Steps/Sec: 10.04, Grad Norm: 0.0060 +[2025-02-21 09:55:02] (step=0487800) Train Loss: 0.1465, Train Steps/Sec: 10.64, Grad Norm: 0.0060 +[2025-02-21 09:55:11] (step=0487900) Train Loss: 0.1477, Train Steps/Sec: 10.93, Grad Norm: 0.0061 +[2025-02-21 09:55:19] (step=0488000) Train Loss: 0.1469, Train Steps/Sec: 12.13, Grad Norm: 0.0061 +[2025-02-21 09:55:27] (step=0488100) Train Loss: 0.1446, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 09:55:35] (step=0488200) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 09:55:43] (step=0488300) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 09:55:51] (step=0488400) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 09:55:59] (step=0488500) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 09:56:07] (step=0488600) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 09:56:15] (step=0488700) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 09:56:23] (step=0488800) Train Loss: 0.1466, Train Steps/Sec: 11.75, Grad Norm: 0.0061 +[2025-02-21 09:56:32] (step=0488900) Train Loss: 0.1463, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 09:56:41] (step=0489000) Train Loss: 0.1465, Train Steps/Sec: 11.34, Grad Norm: 0.0059 +[2025-02-21 09:56:49] (step=0489100) Train Loss: 0.1470, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 09:56:58] (step=0489200) Train Loss: 0.1470, Train Steps/Sec: 11.22, Grad Norm: 0.0060 +[2025-02-21 09:57:06] (step=0489300) Train Loss: 0.1478, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 09:57:14] (step=0489400) Train Loss: 0.1466, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 09:57:22] (step=0489500) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 09:57:30] (step=0489600) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 09:57:39] (step=0489700) Train Loss: 0.1462, Train Steps/Sec: 11.37, Grad Norm: 0.0059 +[2025-02-21 09:57:48] (step=0489800) Train Loss: 0.1462, Train Steps/Sec: 10.09, Grad Norm: 0.0060 +[2025-02-21 09:57:58] (step=0489900) Train Loss: 0.1475, Train Steps/Sec: 10.01, Grad Norm: 0.0058 +[2025-02-21 09:58:08] (step=0490000) Train Loss: 0.1456, Train Steps/Sec: 10.94, Grad Norm: 0.0064 +[2025-02-21 09:58:16] (step=0490100) Train Loss: 0.1443, Train Steps/Sec: 12.54, Grad Norm: 0.0060 +[2025-02-21 09:58:24] (step=0490200) Train Loss: 0.1468, Train Steps/Sec: 12.08, Grad Norm: 0.0061 +[2025-02-21 09:58:32] (step=0490300) Train Loss: 0.1463, Train Steps/Sec: 12.53, Grad Norm: 0.0058 +[2025-02-21 09:58:41] (step=0490400) Train Loss: 0.1474, Train Steps/Sec: 11.21, Grad Norm: 0.0058 +[2025-02-21 09:58:49] (step=0490500) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 09:58:57] (step=0490600) Train Loss: 0.1456, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 09:59:05] (step=0490700) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 09:59:12] (step=0490800) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 09:59:20] (step=0490900) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 09:59:29] (step=0491000) Train Loss: 0.1464, Train Steps/Sec: 11.71, Grad Norm: 0.0061 +[2025-02-21 09:59:38] (step=0491100) Train Loss: 0.1467, Train Steps/Sec: 11.37, Grad Norm: 0.0057 +[2025-02-21 09:59:47] (step=0491200) Train Loss: 0.1472, Train Steps/Sec: 11.35, Grad Norm: 0.0057 +[2025-02-21 09:59:55] (step=0491300) Train Loss: 0.1461, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 10:00:03] (step=0491400) Train Loss: 0.1463, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 10:00:11] (step=0491500) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 10:00:19] (step=0491600) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 10:00:28] (step=0491700) Train Loss: 0.1459, Train Steps/Sec: 11.23, Grad Norm: 0.0058 +[2025-02-21 10:00:36] (step=0491800) Train Loss: 0.1470, Train Steps/Sec: 12.15, Grad Norm: 0.0058 +[2025-02-21 10:00:45] (step=0491900) Train Loss: 0.1460, Train Steps/Sec: 11.36, Grad Norm: 0.0061 +[2025-02-21 10:00:56] (step=0492000) Train Loss: 0.1460, Train Steps/Sec: 8.77, Grad Norm: 0.0061 +[2025-02-21 10:01:05] (step=0492100) Train Loss: 0.1471, Train Steps/Sec: 11.06, Grad Norm: 0.0060 +[2025-02-21 10:01:14] (step=0492200) Train Loss: 0.1451, Train Steps/Sec: 11.71, Grad Norm: 0.0060 +[2025-02-21 10:01:21] (step=0492300) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 10:01:30] (step=0492400) Train Loss: 0.1450, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 10:01:38] (step=0492500) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 10:01:46] (step=0492600) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 10:01:54] (step=0492700) Train Loss: 0.1457, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 10:02:01] (step=0492800) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 10:02:10] (step=0492900) Train Loss: 0.1473, Train Steps/Sec: 11.26, Grad Norm: 0.0060 +[2025-02-21 10:02:18] (step=0493000) Train Loss: 0.1455, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 10:02:26] (step=0493100) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 10:02:35] (step=0493200) Train Loss: 0.1462, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 10:02:44] (step=0493300) Train Loss: 0.1454, Train Steps/Sec: 11.36, Grad Norm: 0.0066 +[2025-02-21 10:02:52] (step=0493400) Train Loss: 0.1459, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 10:03:01] (step=0493500) Train Loss: 0.1463, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 10:03:09] (step=0493600) Train Loss: 0.1446, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 10:03:16] (step=0493700) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 10:03:24] (step=0493800) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 10:03:32] (step=0493900) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 10:03:41] (step=0494000) Train Loss: 0.1456, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 10:03:50] (step=0494100) Train Loss: 0.1483, Train Steps/Sec: 10.69, Grad Norm: 0.0060 +[2025-02-21 10:04:02] (step=0494200) Train Loss: 0.1472, Train Steps/Sec: 8.07, Grad Norm: 0.0060 +[2025-02-21 10:04:11] (step=0494300) Train Loss: 0.1461, Train Steps/Sec: 11.00, Grad Norm: 0.0060 +[2025-02-21 10:04:19] (step=0494400) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 10:04:28] (step=0494500) Train Loss: 0.1467, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 10:04:35] (step=0494600) Train Loss: 0.1467, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 10:04:43] (step=0494700) Train Loss: 0.1456, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 10:04:51] (step=0494800) Train Loss: 0.1473, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 10:04:59] (step=0494900) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 10:05:07] (step=0495000) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 10:05:15] (step=0495100) Train Loss: 0.1472, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 10:05:23] (step=0495200) Train Loss: 0.1459, Train Steps/Sec: 12.65, Grad Norm: 0.0060 +[2025-02-21 10:05:31] (step=0495300) Train Loss: 0.1472, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 10:05:41] (step=0495400) Train Loss: 0.1468, Train Steps/Sec: 10.58, Grad Norm: 0.0059 +[2025-02-21 10:05:49] (step=0495500) Train Loss: 0.1477, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 10:05:58] (step=0495600) Train Loss: 0.1465, Train Steps/Sec: 11.33, Grad Norm: 0.0058 +[2025-02-21 10:06:06] (step=0495700) Train Loss: 0.1473, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 10:06:14] (step=0495800) Train Loss: 0.1450, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 10:06:22] (step=0495900) Train Loss: 0.1453, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 10:06:30] (step=0496000) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 10:06:38] (step=0496100) Train Loss: 0.1457, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 10:06:47] (step=0496200) Train Loss: 0.1456, Train Steps/Sec: 11.47, Grad Norm: 0.0062 +[2025-02-21 10:06:57] (step=0496300) Train Loss: 0.1464, Train Steps/Sec: 10.36, Grad Norm: 0.0058 +[2025-02-21 10:07:07] (step=0496400) Train Loss: 0.1470, Train Steps/Sec: 9.48, Grad Norm: 0.0069 +[2025-02-21 10:07:16] (step=0496500) Train Loss: 0.1454, Train Steps/Sec: 11.32, Grad Norm: 0.0061 +[2025-02-21 10:07:24] (step=0496600) Train Loss: 0.1451, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 10:07:33] (step=0496700) Train Loss: 0.1471, Train Steps/Sec: 10.83, Grad Norm: 0.0060 +[2025-02-21 10:07:41] (step=0496800) Train Loss: 0.1453, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 10:07:49] (step=0496900) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 10:07:57] (step=0497000) Train Loss: 0.1455, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 10:08:05] (step=0497100) Train Loss: 0.1470, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 10:08:13] (step=0497200) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 10:08:21] (step=0497300) Train Loss: 0.1473, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 10:08:29] (step=0497400) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 10:08:37] (step=0497500) Train Loss: 0.1467, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 10:08:45] (step=0497600) Train Loss: 0.1457, Train Steps/Sec: 11.74, Grad Norm: 0.0061 +[2025-02-21 10:08:54] (step=0497700) Train Loss: 0.1471, Train Steps/Sec: 11.75, Grad Norm: 0.0068 +[2025-02-21 10:09:03] (step=0497800) Train Loss: 0.1465, Train Steps/Sec: 11.21, Grad Norm: 0.0060 +[2025-02-21 10:09:12] (step=0497900) Train Loss: 0.1447, Train Steps/Sec: 10.81, Grad Norm: 0.0060 +[2025-02-21 10:09:20] (step=0498000) Train Loss: 0.1450, Train Steps/Sec: 12.54, Grad Norm: 0.0060 +[2025-02-21 10:09:28] (step=0498100) Train Loss: 0.1458, Train Steps/Sec: 12.54, Grad Norm: 0.0058 +[2025-02-21 10:09:36] (step=0498200) Train Loss: 0.1473, Train Steps/Sec: 12.54, Grad Norm: 0.0059 +[2025-02-21 10:09:45] (step=0498300) Train Loss: 0.1455, Train Steps/Sec: 11.69, Grad Norm: 0.0059 +[2025-02-21 10:09:54] (step=0498400) Train Loss: 0.1461, Train Steps/Sec: 10.67, Grad Norm: 0.0060 +[2025-02-21 10:10:03] (step=0498500) Train Loss: 0.1476, Train Steps/Sec: 10.97, Grad Norm: 0.0057 +[2025-02-21 10:10:14] (step=0498600) Train Loss: 0.1453, Train Steps/Sec: 9.49, Grad Norm: 0.0057 +[2025-02-21 10:10:22] (step=0498700) Train Loss: 0.1461, Train Steps/Sec: 11.65, Grad Norm: 0.0061 +[2025-02-21 10:10:30] (step=0498800) Train Loss: 0.1445, Train Steps/Sec: 12.53, Grad Norm: 0.0057 +[2025-02-21 10:10:38] (step=0498900) Train Loss: 0.1456, Train Steps/Sec: 12.05, Grad Norm: 0.0063 +[2025-02-21 10:10:46] (step=0499000) Train Loss: 0.1475, Train Steps/Sec: 12.53, Grad Norm: 0.0061 +[2025-02-21 10:10:54] (step=0499100) Train Loss: 0.1464, Train Steps/Sec: 12.51, Grad Norm: 0.0060 +[2025-02-21 10:11:03] (step=0499200) Train Loss: 0.1482, Train Steps/Sec: 11.12, Grad Norm: 0.0060 +[2025-02-21 10:11:11] (step=0499300) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 10:11:19] (step=0499400) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 10:11:27] (step=0499500) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 10:11:35] (step=0499600) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 10:11:43] (step=0499700) Train Loss: 0.1460, Train Steps/Sec: 12.12, Grad Norm: 0.0063 +[2025-02-21 10:11:52] (step=0499800) Train Loss: 0.1465, Train Steps/Sec: 11.36, Grad Norm: 0.0062 +[2025-02-21 10:12:01] (step=0499900) Train Loss: 0.1466, Train Steps/Sec: 12.15, Grad Norm: 0.0058 +[2025-02-21 10:12:10] (step=0500000) Train Loss: 0.1471, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-21 10:12:13] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0500000.pt +[2025-02-21 10:12:21] (step=0500100) Train Loss: 0.1467, Train Steps/Sec: 9.06, Grad Norm: 0.0060 +[2025-02-21 10:12:29] (step=0500200) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 10:12:37] (step=0500300) Train Loss: 0.1462, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 10:12:45] (step=0500400) Train Loss: 0.1456, Train Steps/Sec: 11.59, Grad Norm: 0.0060 +[2025-02-21 10:12:55] (step=0500500) Train Loss: 0.1457, Train Steps/Sec: 10.69, Grad Norm: 0.0061 +[2025-02-21 10:13:04] (step=0500600) Train Loss: 0.1467, Train Steps/Sec: 10.63, Grad Norm: 0.0062 +[2025-02-21 10:13:13] (step=0500700) Train Loss: 0.1470, Train Steps/Sec: 11.36, Grad Norm: 0.0061 +[2025-02-21 10:13:23] (step=0500800) Train Loss: 0.1462, Train Steps/Sec: 9.74, Grad Norm: 0.0060 +[2025-02-21 10:13:32] (step=0500900) Train Loss: 0.1466, Train Steps/Sec: 11.68, Grad Norm: 0.0061 +[2025-02-21 10:13:40] (step=0501000) Train Loss: 0.1457, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 10:13:48] (step=0501100) Train Loss: 0.1452, Train Steps/Sec: 12.13, Grad Norm: 0.0061 +[2025-02-21 10:13:56] (step=0501200) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 10:14:04] (step=0501300) Train Loss: 0.1473, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 10:14:12] (step=0501400) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 10:14:20] (step=0501500) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 10:14:27] (step=0501600) Train Loss: 0.1464, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 10:14:36] (step=0501700) Train Loss: 0.1458, Train Steps/Sec: 11.24, Grad Norm: 0.0058 +[2025-02-21 10:14:45] (step=0501800) Train Loss: 0.1446, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 10:14:53] (step=0501900) Train Loss: 0.1455, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 10:15:01] (step=0502000) Train Loss: 0.1465, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 10:15:10] (step=0502100) Train Loss: 0.1456, Train Steps/Sec: 12.21, Grad Norm: 0.0059 +[2025-02-21 10:15:19] (step=0502200) Train Loss: 0.1471, Train Steps/Sec: 10.97, Grad Norm: 0.0063 +[2025-02-21 10:15:27] (step=0502300) Train Loss: 0.1480, Train Steps/Sec: 12.63, Grad Norm: 0.0057 +[2025-02-21 10:15:35] (step=0502400) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 10:15:43] (step=0502500) Train Loss: 0.1474, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 10:15:51] (step=0502600) Train Loss: 0.1465, Train Steps/Sec: 11.84, Grad Norm: 0.0063 +[2025-02-21 10:16:00] (step=0502700) Train Loss: 0.1452, Train Steps/Sec: 11.37, Grad Norm: 0.0061 +[2025-02-21 10:16:09] (step=0502800) Train Loss: 0.1473, Train Steps/Sec: 11.01, Grad Norm: 0.0061 +[2025-02-21 10:16:18] (step=0502900) Train Loss: 0.1456, Train Steps/Sec: 11.74, Grad Norm: 0.0057 +[2025-02-21 10:16:29] (step=0503000) Train Loss: 0.1467, Train Steps/Sec: 8.94, Grad Norm: 0.0059 +[2025-02-21 10:16:37] (step=0503100) Train Loss: 0.1459, Train Steps/Sec: 11.70, Grad Norm: 0.0065 +[2025-02-21 10:16:45] (step=0503200) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 10:16:53] (step=0503300) Train Loss: 0.1461, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 10:17:01] (step=0503400) Train Loss: 0.1456, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 10:17:09] (step=0503500) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 10:17:17] (step=0503600) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 10:17:25] (step=0503700) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 10:17:33] (step=0503800) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0067 +[2025-02-21 10:17:41] (step=0503900) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 10:17:49] (step=0504000) Train Loss: 0.1461, Train Steps/Sec: 12.15, Grad Norm: 0.0057 +[2025-02-21 10:17:57] (step=0504100) Train Loss: 0.1444, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 10:18:07] (step=0504200) Train Loss: 0.1457, Train Steps/Sec: 10.23, Grad Norm: 0.0059 +[2025-02-21 10:18:15] (step=0504300) Train Loss: 0.1471, Train Steps/Sec: 11.76, Grad Norm: 0.0061 +[2025-02-21 10:18:24] (step=0504400) Train Loss: 0.1480, Train Steps/Sec: 11.33, Grad Norm: 0.0063 +[2025-02-21 10:18:32] (step=0504500) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 10:18:40] (step=0504600) Train Loss: 0.1477, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 10:18:49] (step=0504700) Train Loss: 0.1448, Train Steps/Sec: 11.78, Grad Norm: 0.0060 +[2025-02-21 10:18:57] (step=0504800) Train Loss: 0.1457, Train Steps/Sec: 11.42, Grad Norm: 0.0060 +[2025-02-21 10:19:06] (step=0504900) Train Loss: 0.1468, Train Steps/Sec: 11.39, Grad Norm: 0.0059 +[2025-02-21 10:19:15] (step=0505000) Train Loss: 0.1477, Train Steps/Sec: 11.72, Grad Norm: 0.0058 +[2025-02-21 10:19:24] (step=0505100) Train Loss: 0.1452, Train Steps/Sec: 10.66, Grad Norm: 0.0061 +[2025-02-21 10:19:33] (step=0505200) Train Loss: 0.1461, Train Steps/Sec: 10.68, Grad Norm: 0.0062 +[2025-02-21 10:19:42] (step=0505300) Train Loss: 0.1471, Train Steps/Sec: 11.69, Grad Norm: 0.0058 +[2025-02-21 10:19:50] (step=0505400) Train Loss: 0.1474, Train Steps/Sec: 12.53, Grad Norm: 0.0060 +[2025-02-21 10:19:59] (step=0505500) Train Loss: 0.1473, Train Steps/Sec: 10.88, Grad Norm: 0.0063 +[2025-02-21 10:20:07] (step=0505600) Train Loss: 0.1462, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 10:20:15] (step=0505700) Train Loss: 0.1458, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 10:20:23] (step=0505800) Train Loss: 0.1467, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 10:20:31] (step=0505900) Train Loss: 0.1450, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 10:20:39] (step=0506000) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 10:20:47] (step=0506100) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 10:20:55] (step=0506200) Train Loss: 0.1452, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 10:21:03] (step=0506300) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 10:21:12] (step=0506400) Train Loss: 0.1457, Train Steps/Sec: 11.35, Grad Norm: 0.0059 +[2025-02-21 10:21:20] (step=0506500) Train Loss: 0.1468, Train Steps/Sec: 11.76, Grad Norm: 0.0058 +[2025-02-21 10:21:29] (step=0506600) Train Loss: 0.1462, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 10:21:38] (step=0506700) Train Loss: 0.1468, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-21 10:21:46] (step=0506800) Train Loss: 0.1472, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 10:21:55] (step=0506900) Train Loss: 0.1467, Train Steps/Sec: 11.82, Grad Norm: 0.0056 +[2025-02-21 10:22:04] (step=0507000) Train Loss: 0.1463, Train Steps/Sec: 11.00, Grad Norm: 0.0059 +[2025-02-21 10:22:12] (step=0507100) Train Loss: 0.1461, Train Steps/Sec: 12.21, Grad Norm: 0.0058 +[2025-02-21 10:22:20] (step=0507200) Train Loss: 0.1468, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 10:22:30] (step=0507300) Train Loss: 0.1453, Train Steps/Sec: 10.01, Grad Norm: 0.0060 +[2025-02-21 10:22:39] (step=0507400) Train Loss: 0.1466, Train Steps/Sec: 11.41, Grad Norm: 0.0059 +[2025-02-21 10:22:48] (step=0507500) Train Loss: 0.1459, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-21 10:22:56] (step=0507600) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 10:23:04] (step=0507700) Train Loss: 0.1452, Train Steps/Sec: 12.13, Grad Norm: 0.0066 +[2025-02-21 10:23:12] (step=0507800) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 10:23:20] (step=0507900) Train Loss: 0.1468, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 10:23:29] (step=0508000) Train Loss: 0.1450, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-21 10:23:37] (step=0508100) Train Loss: 0.1470, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 10:23:45] (step=0508200) Train Loss: 0.1459, Train Steps/Sec: 12.53, Grad Norm: 0.0060 +[2025-02-21 10:23:53] (step=0508300) Train Loss: 0.1470, Train Steps/Sec: 12.50, Grad Norm: 0.0060 +[2025-02-21 10:24:01] (step=0508400) Train Loss: 0.1453, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 10:24:09] (step=0508500) Train Loss: 0.1450, Train Steps/Sec: 12.13, Grad Norm: 0.0057 +[2025-02-21 10:24:18] (step=0508600) Train Loss: 0.1465, Train Steps/Sec: 11.36, Grad Norm: 0.0057 +[2025-02-21 10:24:26] (step=0508700) Train Loss: 0.1478, Train Steps/Sec: 11.70, Grad Norm: 0.0060 +[2025-02-21 10:24:35] (step=0508800) Train Loss: 0.1454, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 10:24:43] (step=0508900) Train Loss: 0.1471, Train Steps/Sec: 12.11, Grad Norm: 0.0059 +[2025-02-21 10:24:52] (step=0509000) Train Loss: 0.1469, Train Steps/Sec: 11.78, Grad Norm: 0.0062 +[2025-02-21 10:25:01] (step=0509100) Train Loss: 0.1469, Train Steps/Sec: 11.36, Grad Norm: 0.0058 +[2025-02-21 10:25:10] (step=0509200) Train Loss: 0.1463, Train Steps/Sec: 10.42, Grad Norm: 0.0058 +[2025-02-21 10:25:18] (step=0509300) Train Loss: 0.1470, Train Steps/Sec: 12.17, Grad Norm: 0.0059 +[2025-02-21 10:25:27] (step=0509400) Train Loss: 0.1455, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 10:25:37] (step=0509500) Train Loss: 0.1469, Train Steps/Sec: 9.75, Grad Norm: 0.0061 +[2025-02-21 10:25:45] (step=0509600) Train Loss: 0.1482, Train Steps/Sec: 11.79, Grad Norm: 0.0059 +[2025-02-21 10:25:54] (step=0509700) Train Loss: 0.1471, Train Steps/Sec: 11.70, Grad Norm: 0.0061 +[2025-02-21 10:26:02] (step=0509800) Train Loss: 0.1469, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 10:26:10] (step=0509900) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 10:26:18] (step=0510000) Train Loss: 0.1455, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 10:26:26] (step=0510100) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 10:26:34] (step=0510200) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 10:26:42] (step=0510300) Train Loss: 0.1448, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 10:26:50] (step=0510400) Train Loss: 0.1461, Train Steps/Sec: 12.54, Grad Norm: 0.0061 +[2025-02-21 10:26:59] (step=0510500) Train Loss: 0.1479, Train Steps/Sec: 11.27, Grad Norm: 0.0058 +[2025-02-21 10:27:07] (step=0510600) Train Loss: 0.1464, Train Steps/Sec: 12.12, Grad Norm: 0.0058 +[2025-02-21 10:27:15] (step=0510700) Train Loss: 0.1465, Train Steps/Sec: 12.16, Grad Norm: 0.0058 +[2025-02-21 10:27:24] (step=0510800) Train Loss: 0.1470, Train Steps/Sec: 11.34, Grad Norm: 0.0059 +[2025-02-21 10:27:32] (step=0510900) Train Loss: 0.1455, Train Steps/Sec: 11.75, Grad Norm: 0.0057 +[2025-02-21 10:27:41] (step=0511000) Train Loss: 0.1465, Train Steps/Sec: 11.77, Grad Norm: 0.0059 +[2025-02-21 10:27:49] (step=0511100) Train Loss: 0.1466, Train Steps/Sec: 11.79, Grad Norm: 0.0061 +[2025-02-21 10:27:58] (step=0511200) Train Loss: 0.1468, Train Steps/Sec: 11.79, Grad Norm: 0.0061 +[2025-02-21 10:28:07] (step=0511300) Train Loss: 0.1475, Train Steps/Sec: 11.39, Grad Norm: 0.0062 +[2025-02-21 10:28:15] (step=0511400) Train Loss: 0.1463, Train Steps/Sec: 12.21, Grad Norm: 0.0061 +[2025-02-21 10:28:23] (step=0511500) Train Loss: 0.1464, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 10:28:32] (step=0511600) Train Loss: 0.1462, Train Steps/Sec: 11.73, Grad Norm: 0.0058 +[2025-02-21 10:28:43] (step=0511700) Train Loss: 0.1455, Train Steps/Sec: 8.96, Grad Norm: 0.0059 +[2025-02-21 10:28:51] (step=0511800) Train Loss: 0.1465, Train Steps/Sec: 11.74, Grad Norm: 0.0058 +[2025-02-21 10:29:00] (step=0511900) Train Loss: 0.1471, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 10:29:08] (step=0512000) Train Loss: 0.1464, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 10:29:16] (step=0512100) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 10:29:24] (step=0512200) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 10:29:32] (step=0512300) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 10:29:40] (step=0512400) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 10:29:47] (step=0512500) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 10:29:55] (step=0512600) Train Loss: 0.1474, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 10:30:03] (step=0512700) Train Loss: 0.1477, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 10:30:12] (step=0512800) Train Loss: 0.1455, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 10:30:20] (step=0512900) Train Loss: 0.1460, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 10:30:30] (step=0513000) Train Loss: 0.1463, Train Steps/Sec: 9.95, Grad Norm: 0.0060 +[2025-02-21 10:30:38] (step=0513100) Train Loss: 0.1458, Train Steps/Sec: 11.71, Grad Norm: 0.0062 +[2025-02-21 10:30:47] (step=0513200) Train Loss: 0.1454, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 10:30:55] (step=0513300) Train Loss: 0.1449, Train Steps/Sec: 11.80, Grad Norm: 0.0057 +[2025-02-21 10:31:05] (step=0513400) Train Loss: 0.1453, Train Steps/Sec: 11.00, Grad Norm: 0.0058 +[2025-02-21 10:31:13] (step=0513500) Train Loss: 0.1471, Train Steps/Sec: 12.20, Grad Norm: 0.0058 +[2025-02-21 10:31:21] (step=0513600) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 10:31:29] (step=0513700) Train Loss: 0.1452, Train Steps/Sec: 12.19, Grad Norm: 0.0059 +[2025-02-21 10:31:37] (step=0513800) Train Loss: 0.1476, Train Steps/Sec: 11.76, Grad Norm: 0.0063 +[2025-02-21 10:31:48] (step=0513900) Train Loss: 0.1452, Train Steps/Sec: 9.52, Grad Norm: 0.0060 +[2025-02-21 10:31:56] (step=0514000) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 10:32:04] (step=0514100) Train Loss: 0.1465, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 10:32:14] (step=0514200) Train Loss: 0.1460, Train Steps/Sec: 10.82, Grad Norm: 0.0058 +[2025-02-21 10:32:22] (step=0514300) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 10:32:30] (step=0514400) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 10:32:37] (step=0514500) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 10:32:45] (step=0514600) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 10:32:53] (step=0514700) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 10:33:01] (step=0514800) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 10:33:10] (step=0514900) Train Loss: 0.1469, Train Steps/Sec: 12.10, Grad Norm: 0.0061 +[2025-02-21 10:33:17] (step=0515000) Train Loss: 0.1448, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 10:33:26] (step=0515100) Train Loss: 0.1473, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 10:33:34] (step=0515200) Train Loss: 0.1458, Train Steps/Sec: 11.35, Grad Norm: 0.0061 +[2025-02-21 10:33:44] (step=0515300) Train Loss: 0.1466, Train Steps/Sec: 10.97, Grad Norm: 0.0059 +[2025-02-21 10:33:52] (step=0515400) Train Loss: 0.1474, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 10:34:02] (step=0515500) Train Loss: 0.1461, Train Steps/Sec: 10.28, Grad Norm: 0.0058 +[2025-02-21 10:34:11] (step=0515600) Train Loss: 0.1447, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 10:34:19] (step=0515700) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 10:34:27] (step=0515800) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 10:34:35] (step=0515900) Train Loss: 0.1460, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 10:34:44] (step=0516000) Train Loss: 0.1448, Train Steps/Sec: 10.96, Grad Norm: 0.0057 +[2025-02-21 10:34:54] (step=0516100) Train Loss: 0.1464, Train Steps/Sec: 10.09, Grad Norm: 0.0061 +[2025-02-21 10:35:02] (step=0516200) Train Loss: 0.1453, Train Steps/Sec: 11.71, Grad Norm: 0.0058 +[2025-02-21 10:35:10] (step=0516300) Train Loss: 0.1464, Train Steps/Sec: 12.55, Grad Norm: 0.0061 +[2025-02-21 10:35:19] (step=0516400) Train Loss: 0.1462, Train Steps/Sec: 12.10, Grad Norm: 0.0062 +[2025-02-21 10:35:27] (step=0516500) Train Loss: 0.1459, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 10:35:34] (step=0516600) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 10:35:43] (step=0516700) Train Loss: 0.1465, Train Steps/Sec: 11.11, Grad Norm: 0.0059 +[2025-02-21 10:35:51] (step=0516800) Train Loss: 0.1462, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 10:35:59] (step=0516900) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 10:36:07] (step=0517000) Train Loss: 0.1478, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 10:36:16] (step=0517100) Train Loss: 0.1451, Train Steps/Sec: 12.12, Grad Norm: 0.0064 +[2025-02-21 10:36:23] (step=0517200) Train Loss: 0.1476, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 10:36:32] (step=0517300) Train Loss: 0.1468, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 10:36:40] (step=0517400) Train Loss: 0.1454, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 10:36:50] (step=0517500) Train Loss: 0.1477, Train Steps/Sec: 10.60, Grad Norm: 0.0060 +[2025-02-21 10:36:59] (step=0517600) Train Loss: 0.1461, Train Steps/Sec: 11.33, Grad Norm: 0.0059 +[2025-02-21 10:37:08] (step=0517700) Train Loss: 0.1464, Train Steps/Sec: 11.36, Grad Norm: 0.0062 +[2025-02-21 10:37:16] (step=0517800) Train Loss: 0.1465, Train Steps/Sec: 12.22, Grad Norm: 0.0060 +[2025-02-21 10:37:24] (step=0517900) Train Loss: 0.1455, Train Steps/Sec: 12.55, Grad Norm: 0.0056 +[2025-02-21 10:37:33] (step=0518000) Train Loss: 0.1468, Train Steps/Sec: 11.28, Grad Norm: 0.0064 +[2025-02-21 10:37:41] (step=0518100) Train Loss: 0.1462, Train Steps/Sec: 12.11, Grad Norm: 0.0061 +[2025-02-21 10:37:51] (step=0518200) Train Loss: 0.1470, Train Steps/Sec: 10.32, Grad Norm: 0.0058 +[2025-02-21 10:38:00] (step=0518300) Train Loss: 0.1464, Train Steps/Sec: 10.69, Grad Norm: 0.0059 +[2025-02-21 10:38:08] (step=0518400) Train Loss: 0.1463, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 10:38:16] (step=0518500) Train Loss: 0.1452, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 10:38:25] (step=0518600) Train Loss: 0.1458, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 10:38:33] (step=0518700) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 10:38:40] (step=0518800) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 10:38:48] (step=0518900) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 10:38:56] (step=0519000) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 10:39:04] (step=0519100) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 10:39:13] (step=0519200) Train Loss: 0.1449, Train Steps/Sec: 11.24, Grad Norm: 0.0059 +[2025-02-21 10:39:21] (step=0519300) Train Loss: 0.1465, Train Steps/Sec: 12.08, Grad Norm: 0.0061 +[2025-02-21 10:39:29] (step=0519400) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 10:39:38] (step=0519500) Train Loss: 0.1476, Train Steps/Sec: 11.72, Grad Norm: 0.0062 +[2025-02-21 10:39:47] (step=0519600) Train Loss: 0.1465, Train Steps/Sec: 11.37, Grad Norm: 0.0059 +[2025-02-21 10:39:56] (step=0519700) Train Loss: 0.1446, Train Steps/Sec: 10.71, Grad Norm: 0.0063 +[2025-02-21 10:40:05] (step=0519800) Train Loss: 0.1456, Train Steps/Sec: 10.97, Grad Norm: 0.0064 +[2025-02-21 10:40:14] (step=0519900) Train Loss: 0.1463, Train Steps/Sec: 11.79, Grad Norm: 0.0060 +[2025-02-21 10:40:22] (step=0520000) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 10:40:26] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0520000.pt +[2025-02-21 10:40:33] (step=0520100) Train Loss: 0.1456, Train Steps/Sec: 8.43, Grad Norm: 0.0058 +[2025-02-21 10:40:41] (step=0520200) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 10:40:50] (step=0520300) Train Loss: 0.1459, Train Steps/Sec: 12.19, Grad Norm: 0.0059 +[2025-02-21 10:40:59] (step=0520400) Train Loss: 0.1466, Train Steps/Sec: 10.30, Grad Norm: 0.0061 +[2025-02-21 10:41:10] (step=0520500) Train Loss: 0.1452, Train Steps/Sec: 9.77, Grad Norm: 0.0058 +[2025-02-21 10:41:18] (step=0520600) Train Loss: 0.1458, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 10:41:26] (step=0520700) Train Loss: 0.1468, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 10:41:34] (step=0520800) Train Loss: 0.1465, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 10:41:42] (step=0520900) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 10:41:50] (step=0521000) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 10:41:58] (step=0521100) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 10:42:06] (step=0521200) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 10:42:14] (step=0521300) Train Loss: 0.1450, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 10:42:22] (step=0521400) Train Loss: 0.1468, Train Steps/Sec: 12.13, Grad Norm: 0.0057 +[2025-02-21 10:42:30] (step=0521500) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 10:42:38] (step=0521600) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 10:42:48] (step=0521700) Train Loss: 0.1467, Train Steps/Sec: 9.79, Grad Norm: 0.0057 +[2025-02-21 10:42:57] (step=0521800) Train Loss: 0.1457, Train Steps/Sec: 11.01, Grad Norm: 0.0059 +[2025-02-21 10:43:07] (step=0521900) Train Loss: 0.1470, Train Steps/Sec: 10.69, Grad Norm: 0.0057 +[2025-02-21 10:43:15] (step=0522000) Train Loss: 0.1475, Train Steps/Sec: 11.72, Grad Norm: 0.0063 +[2025-02-21 10:43:23] (step=0522100) Train Loss: 0.1463, Train Steps/Sec: 12.19, Grad Norm: 0.0059 +[2025-02-21 10:43:31] (step=0522200) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 10:43:39] (step=0522300) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 10:43:47] (step=0522400) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 10:43:56] (step=0522500) Train Loss: 0.1473, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 10:44:05] (step=0522600) Train Loss: 0.1458, Train Steps/Sec: 10.69, Grad Norm: 0.0059 +[2025-02-21 10:44:14] (step=0522700) Train Loss: 0.1459, Train Steps/Sec: 10.66, Grad Norm: 0.0060 +[2025-02-21 10:44:23] (step=0522800) Train Loss: 0.1467, Train Steps/Sec: 11.71, Grad Norm: 0.0058 +[2025-02-21 10:44:31] (step=0522900) Train Loss: 0.1468, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 10:44:40] (step=0523000) Train Loss: 0.1464, Train Steps/Sec: 10.86, Grad Norm: 0.0059 +[2025-02-21 10:44:48] (step=0523100) Train Loss: 0.1451, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 10:44:56] (step=0523200) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 10:45:04] (step=0523300) Train Loss: 0.1467, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 10:45:12] (step=0523400) Train Loss: 0.1469, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 10:45:20] (step=0523500) Train Loss: 0.1464, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 10:45:28] (step=0523600) Train Loss: 0.1473, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 10:45:36] (step=0523700) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 10:45:44] (step=0523800) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 10:45:53] (step=0523900) Train Loss: 0.1446, Train Steps/Sec: 10.60, Grad Norm: 0.0061 +[2025-02-21 10:46:02] (step=0524000) Train Loss: 0.1478, Train Steps/Sec: 11.40, Grad Norm: 0.0061 +[2025-02-21 10:46:12] (step=0524100) Train Loss: 0.1454, Train Steps/Sec: 10.34, Grad Norm: 0.0060 +[2025-02-21 10:46:21] (step=0524200) Train Loss: 0.1471, Train Steps/Sec: 10.55, Grad Norm: 0.0063 +[2025-02-21 10:46:29] (step=0524300) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 10:46:37] (step=0524400) Train Loss: 0.1476, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 10:46:45] (step=0524500) Train Loss: 0.1457, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 10:46:53] (step=0524600) Train Loss: 0.1458, Train Steps/Sec: 12.16, Grad Norm: 0.0058 +[2025-02-21 10:47:01] (step=0524700) Train Loss: 0.1460, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 10:47:12] (step=0524800) Train Loss: 0.1465, Train Steps/Sec: 9.31, Grad Norm: 0.0060 +[2025-02-21 10:47:20] (step=0524900) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 10:47:29] (step=0525000) Train Loss: 0.1466, Train Steps/Sec: 11.72, Grad Norm: 0.0058 +[2025-02-21 10:47:37] (step=0525100) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 10:47:45] (step=0525200) Train Loss: 0.1473, Train Steps/Sec: 12.13, Grad Norm: 0.0063 +[2025-02-21 10:47:53] (step=0525300) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 10:48:01] (step=0525400) Train Loss: 0.1452, Train Steps/Sec: 12.55, Grad Norm: 0.0059 +[2025-02-21 10:48:10] (step=0525500) Train Loss: 0.1454, Train Steps/Sec: 11.27, Grad Norm: 0.0058 +[2025-02-21 10:48:18] (step=0525600) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 10:48:25] (step=0525700) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 10:48:34] (step=0525800) Train Loss: 0.1455, Train Steps/Sec: 12.12, Grad Norm: 0.0060 +[2025-02-21 10:48:42] (step=0525900) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 10:48:50] (step=0526000) Train Loss: 0.1451, Train Steps/Sec: 12.11, Grad Norm: 0.0059 +[2025-02-21 10:49:00] (step=0526100) Train Loss: 0.1462, Train Steps/Sec: 10.30, Grad Norm: 0.0058 +[2025-02-21 10:49:08] (step=0526200) Train Loss: 0.1475, Train Steps/Sec: 11.36, Grad Norm: 0.0058 +[2025-02-21 10:49:18] (step=0526300) Train Loss: 0.1462, Train Steps/Sec: 10.31, Grad Norm: 0.0058 +[2025-02-21 10:49:26] (step=0526400) Train Loss: 0.1466, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 10:49:34] (step=0526500) Train Loss: 0.1460, Train Steps/Sec: 12.55, Grad Norm: 0.0059 +[2025-02-21 10:49:42] (step=0526600) Train Loss: 0.1476, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 10:49:51] (step=0526700) Train Loss: 0.1467, Train Steps/Sec: 11.16, Grad Norm: 0.0058 +[2025-02-21 10:49:59] (step=0526800) Train Loss: 0.1468, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 10:50:08] (step=0526900) Train Loss: 0.1461, Train Steps/Sec: 11.30, Grad Norm: 0.0062 +[2025-02-21 10:50:18] (step=0527000) Train Loss: 0.1461, Train Steps/Sec: 9.78, Grad Norm: 0.0060 +[2025-02-21 10:50:27] (step=0527100) Train Loss: 0.1465, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-21 10:50:35] (step=0527200) Train Loss: 0.1472, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 10:50:43] (step=0527300) Train Loss: 0.1464, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 10:50:51] (step=0527400) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 10:50:59] (step=0527500) Train Loss: 0.1460, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 10:51:07] (step=0527600) Train Loss: 0.1451, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 10:51:15] (step=0527700) Train Loss: 0.1462, Train Steps/Sec: 12.52, Grad Norm: 0.0059 +[2025-02-21 10:51:23] (step=0527800) Train Loss: 0.1465, Train Steps/Sec: 12.52, Grad Norm: 0.0058 +[2025-02-21 10:51:31] (step=0527900) Train Loss: 0.1465, Train Steps/Sec: 12.46, Grad Norm: 0.0060 +[2025-02-21 10:51:40] (step=0528000) Train Loss: 0.1473, Train Steps/Sec: 10.80, Grad Norm: 0.0059 +[2025-02-21 10:51:48] (step=0528100) Train Loss: 0.1453, Train Steps/Sec: 12.07, Grad Norm: 0.0060 +[2025-02-21 10:51:57] (step=0528200) Train Loss: 0.1462, Train Steps/Sec: 12.11, Grad Norm: 0.0059 +[2025-02-21 10:52:07] (step=0528300) Train Loss: 0.1464, Train Steps/Sec: 9.72, Grad Norm: 0.0060 +[2025-02-21 10:52:16] (step=0528400) Train Loss: 0.1456, Train Steps/Sec: 11.00, Grad Norm: 0.0060 +[2025-02-21 10:52:25] (step=0528500) Train Loss: 0.1468, Train Steps/Sec: 11.70, Grad Norm: 0.0062 +[2025-02-21 10:52:33] (step=0528600) Train Loss: 0.1458, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 10:52:41] (step=0528700) Train Loss: 0.1473, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 10:52:49] (step=0528800) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 10:52:56] (step=0528900) Train Loss: 0.1473, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 10:53:05] (step=0529000) Train Loss: 0.1468, Train Steps/Sec: 12.16, Grad Norm: 0.0060 +[2025-02-21 10:53:14] (step=0529100) Train Loss: 0.1457, Train Steps/Sec: 11.31, Grad Norm: 0.0060 +[2025-02-21 10:53:25] (step=0529200) Train Loss: 0.1460, Train Steps/Sec: 8.94, Grad Norm: 0.0058 +[2025-02-21 10:53:33] (step=0529300) Train Loss: 0.1466, Train Steps/Sec: 11.68, Grad Norm: 0.0060 +[2025-02-21 10:53:41] (step=0529400) Train Loss: 0.1460, Train Steps/Sec: 12.55, Grad Norm: 0.0059 +[2025-02-21 10:53:49] (step=0529500) Train Loss: 0.1466, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 10:53:57] (step=0529600) Train Loss: 0.1453, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 10:54:05] (step=0529700) Train Loss: 0.1474, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 10:54:13] (step=0529800) Train Loss: 0.1455, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 10:54:21] (step=0529900) Train Loss: 0.1470, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 10:54:29] (step=0530000) Train Loss: 0.1473, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 10:54:37] (step=0530100) Train Loss: 0.1457, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 10:54:45] (step=0530200) Train Loss: 0.1471, Train Steps/Sec: 12.09, Grad Norm: 0.0059 +[2025-02-21 10:54:54] (step=0530300) Train Loss: 0.1474, Train Steps/Sec: 12.13, Grad Norm: 0.0058 +[2025-02-21 10:55:03] (step=0530400) Train Loss: 0.1467, Train Steps/Sec: 11.33, Grad Norm: 0.0061 +[2025-02-21 10:55:13] (step=0530500) Train Loss: 0.1471, Train Steps/Sec: 9.14, Grad Norm: 0.0058 +[2025-02-21 10:55:22] (step=0530600) Train Loss: 0.1458, Train Steps/Sec: 11.36, Grad Norm: 0.0061 +[2025-02-21 10:55:31] (step=0530700) Train Loss: 0.1466, Train Steps/Sec: 11.73, Grad Norm: 0.0058 +[2025-02-21 10:55:39] (step=0530800) Train Loss: 0.1447, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 10:55:47] (step=0530900) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 10:55:55] (step=0531000) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 10:56:03] (step=0531100) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 10:56:11] (step=0531200) Train Loss: 0.1462, Train Steps/Sec: 11.75, Grad Norm: 0.0059 +[2025-02-21 10:56:20] (step=0531300) Train Loss: 0.1464, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 10:56:30] (step=0531400) Train Loss: 0.1453, Train Steps/Sec: 9.79, Grad Norm: 0.0057 +[2025-02-21 10:56:38] (step=0531500) Train Loss: 0.1468, Train Steps/Sec: 11.71, Grad Norm: 0.0058 +[2025-02-21 10:56:46] (step=0531600) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 10:56:56] (step=0531700) Train Loss: 0.1460, Train Steps/Sec: 10.84, Grad Norm: 0.0057 +[2025-02-21 10:57:03] (step=0531800) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 10:57:11] (step=0531900) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 10:57:19] (step=0532000) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 10:57:27] (step=0532100) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 10:57:35] (step=0532200) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 10:57:43] (step=0532300) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 10:57:52] (step=0532400) Train Loss: 0.1455, Train Steps/Sec: 11.71, Grad Norm: 0.0060 +[2025-02-21 10:58:00] (step=0532500) Train Loss: 0.1463, Train Steps/Sec: 11.83, Grad Norm: 0.0059 +[2025-02-21 10:58:09] (step=0532600) Train Loss: 0.1476, Train Steps/Sec: 11.36, Grad Norm: 0.0063 +[2025-02-21 10:58:19] (step=0532700) Train Loss: 0.1462, Train Steps/Sec: 10.32, Grad Norm: 0.0059 +[2025-02-21 10:58:27] (step=0532800) Train Loss: 0.1459, Train Steps/Sec: 11.77, Grad Norm: 0.0059 +[2025-02-21 10:58:36] (step=0532900) Train Loss: 0.1467, Train Steps/Sec: 11.68, Grad Norm: 0.0061 +[2025-02-21 10:58:45] (step=0533000) Train Loss: 0.1455, Train Steps/Sec: 11.26, Grad Norm: 0.0056 +[2025-02-21 10:58:52] (step=0533100) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 10:59:00] (step=0533200) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 10:59:08] (step=0533300) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0056 +[2025-02-21 10:59:17] (step=0533400) Train Loss: 0.1478, Train Steps/Sec: 11.74, Grad Norm: 0.0058 +[2025-02-21 10:59:26] (step=0533500) Train Loss: 0.1459, Train Steps/Sec: 11.05, Grad Norm: 0.0059 +[2025-02-21 10:59:36] (step=0533600) Train Loss: 0.1467, Train Steps/Sec: 10.35, Grad Norm: 0.0060 +[2025-02-21 10:59:44] (step=0533700) Train Loss: 0.1463, Train Steps/Sec: 11.68, Grad Norm: 0.0059 +[2025-02-21 10:59:52] (step=0533800) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 11:00:00] (step=0533900) Train Loss: 0.1476, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 11:00:08] (step=0534000) Train Loss: 0.1447, Train Steps/Sec: 12.54, Grad Norm: 0.0061 +[2025-02-21 11:00:16] (step=0534100) Train Loss: 0.1456, Train Steps/Sec: 12.56, Grad Norm: 0.0055 +[2025-02-21 11:00:25] (step=0534200) Train Loss: 0.1476, Train Steps/Sec: 11.23, Grad Norm: 0.0059 +[2025-02-21 11:00:33] (step=0534300) Train Loss: 0.1468, Train Steps/Sec: 12.55, Grad Norm: 0.0061 +[2025-02-21 11:00:41] (step=0534400) Train Loss: 0.1445, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 11:00:49] (step=0534500) Train Loss: 0.1461, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 11:00:58] (step=0534600) Train Loss: 0.1467, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 11:01:07] (step=0534700) Train Loss: 0.1451, Train Steps/Sec: 11.07, Grad Norm: 0.0058 +[2025-02-21 11:01:16] (step=0534800) Train Loss: 0.1462, Train Steps/Sec: 11.04, Grad Norm: 0.0058 +[2025-02-21 11:01:25] (step=0534900) Train Loss: 0.1463, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 11:01:33] (step=0535000) Train Loss: 0.1462, Train Steps/Sec: 11.38, Grad Norm: 0.0062 +[2025-02-21 11:01:42] (step=0535100) Train Loss: 0.1461, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 11:01:50] (step=0535200) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 11:01:57] (step=0535300) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 11:02:05] (step=0535400) Train Loss: 0.1466, Train Steps/Sec: 12.53, Grad Norm: 0.0061 +[2025-02-21 11:02:14] (step=0535500) Train Loss: 0.1471, Train Steps/Sec: 11.28, Grad Norm: 0.0063 +[2025-02-21 11:02:23] (step=0535600) Train Loss: 0.1462, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 11:02:33] (step=0535700) Train Loss: 0.1472, Train Steps/Sec: 10.33, Grad Norm: 0.0060 +[2025-02-21 11:02:42] (step=0535800) Train Loss: 0.1478, Train Steps/Sec: 11.02, Grad Norm: 0.0063 +[2025-02-21 11:02:50] (step=0535900) Train Loss: 0.1460, Train Steps/Sec: 11.69, Grad Norm: 0.0062 +[2025-02-21 11:02:58] (step=0536000) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 11:03:06] (step=0536100) Train Loss: 0.1458, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 11:03:14] (step=0536200) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 11:03:22] (step=0536300) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 11:03:30] (step=0536400) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 11:03:38] (step=0536500) Train Loss: 0.1458, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 11:03:46] (step=0536600) Train Loss: 0.1478, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 11:03:56] (step=0536700) Train Loss: 0.1460, Train Steps/Sec: 10.53, Grad Norm: 0.0060 +[2025-02-21 11:04:04] (step=0536800) Train Loss: 0.1478, Train Steps/Sec: 11.38, Grad Norm: 0.0061 +[2025-02-21 11:04:13] (step=0536900) Train Loss: 0.1457, Train Steps/Sec: 11.37, Grad Norm: 0.0059 +[2025-02-21 11:04:22] (step=0537000) Train Loss: 0.1451, Train Steps/Sec: 11.78, Grad Norm: 0.0060 +[2025-02-21 11:04:30] (step=0537100) Train Loss: 0.1465, Train Steps/Sec: 11.32, Grad Norm: 0.0058 +[2025-02-21 11:04:39] (step=0537200) Train Loss: 0.1462, Train Steps/Sec: 11.34, Grad Norm: 0.0065 +[2025-02-21 11:04:47] (step=0537300) Train Loss: 0.1471, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 11:04:55] (step=0537400) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 11:05:03] (step=0537500) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 11:05:11] (step=0537600) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 11:05:19] (step=0537700) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 11:05:28] (step=0537800) Train Loss: 0.1456, Train Steps/Sec: 11.35, Grad Norm: 0.0057 +[2025-02-21 11:05:37] (step=0537900) Train Loss: 0.1457, Train Steps/Sec: 10.64, Grad Norm: 0.0057 +[2025-02-21 11:05:48] (step=0538000) Train Loss: 0.1464, Train Steps/Sec: 9.40, Grad Norm: 0.0058 +[2025-02-21 11:05:56] (step=0538100) Train Loss: 0.1473, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 11:06:04] (step=0538200) Train Loss: 0.1462, Train Steps/Sec: 12.12, Grad Norm: 0.0058 +[2025-02-21 11:06:12] (step=0538300) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 11:06:20] (step=0538400) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 11:06:28] (step=0538500) Train Loss: 0.1459, Train Steps/Sec: 12.56, Grad Norm: 0.0058 +[2025-02-21 11:06:36] (step=0538600) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 11:06:44] (step=0538700) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 11:06:52] (step=0538800) Train Loss: 0.1467, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 11:07:01] (step=0538900) Train Loss: 0.1465, Train Steps/Sec: 11.39, Grad Norm: 0.0060 +[2025-02-21 11:07:10] (step=0539000) Train Loss: 0.1465, Train Steps/Sec: 11.73, Grad Norm: 0.0059 +[2025-02-21 11:07:18] (step=0539100) Train Loss: 0.1454, Train Steps/Sec: 11.36, Grad Norm: 0.0062 +[2025-02-21 11:07:28] (step=0539200) Train Loss: 0.1464, Train Steps/Sec: 10.90, Grad Norm: 0.0061 +[2025-02-21 11:07:36] (step=0539300) Train Loss: 0.1452, Train Steps/Sec: 11.28, Grad Norm: 0.0057 +[2025-02-21 11:07:45] (step=0539400) Train Loss: 0.1468, Train Steps/Sec: 11.36, Grad Norm: 0.0058 +[2025-02-21 11:07:53] (step=0539500) Train Loss: 0.1462, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 11:08:01] (step=0539600) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 11:08:09] (step=0539700) Train Loss: 0.1471, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 11:08:17] (step=0539800) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 11:08:25] (step=0539900) Train Loss: 0.1472, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 11:08:34] (step=0540000) Train Loss: 0.1444, Train Steps/Sec: 11.71, Grad Norm: 0.0060 +[2025-02-21 11:08:39] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0540000.pt +[2025-02-21 11:08:49] (step=0540100) Train Loss: 0.1467, Train Steps/Sec: 6.60, Grad Norm: 0.0058 +[2025-02-21 11:08:58] (step=0540200) Train Loss: 0.1455, Train Steps/Sec: 11.30, Grad Norm: 0.0058 +[2025-02-21 11:09:06] (step=0540300) Train Loss: 0.1453, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 11:09:14] (step=0540400) Train Loss: 0.1466, Train Steps/Sec: 12.06, Grad Norm: 0.0057 +[2025-02-21 11:09:23] (step=0540500) Train Loss: 0.1460, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-21 11:09:31] (step=0540600) Train Loss: 0.1468, Train Steps/Sec: 12.56, Grad Norm: 0.0063 +[2025-02-21 11:09:39] (step=0540700) Train Loss: 0.1470, Train Steps/Sec: 12.56, Grad Norm: 0.0058 +[2025-02-21 11:09:47] (step=0540800) Train Loss: 0.1456, Train Steps/Sec: 12.57, Grad Norm: 0.0057 +[2025-02-21 11:09:55] (step=0540900) Train Loss: 0.1465, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 11:10:04] (step=0541000) Train Loss: 0.1467, Train Steps/Sec: 11.68, Grad Norm: 0.0062 +[2025-02-21 11:10:12] (step=0541100) Train Loss: 0.1463, Train Steps/Sec: 11.40, Grad Norm: 0.0060 +[2025-02-21 11:10:21] (step=0541200) Train Loss: 0.1461, Train Steps/Sec: 11.36, Grad Norm: 0.0060 +[2025-02-21 11:10:29] (step=0541300) Train Loss: 0.1458, Train Steps/Sec: 12.15, Grad Norm: 0.0058 +[2025-02-21 11:10:38] (step=0541400) Train Loss: 0.1466, Train Steps/Sec: 12.11, Grad Norm: 0.0059 +[2025-02-21 11:10:47] (step=0541500) Train Loss: 0.1462, Train Steps/Sec: 11.22, Grad Norm: 0.0060 +[2025-02-21 11:10:55] (step=0541600) Train Loss: 0.1462, Train Steps/Sec: 11.23, Grad Norm: 0.0058 +[2025-02-21 11:11:05] (step=0541700) Train Loss: 0.1460, Train Steps/Sec: 10.76, Grad Norm: 0.0057 +[2025-02-21 11:11:13] (step=0541800) Train Loss: 0.1459, Train Steps/Sec: 12.57, Grad Norm: 0.0057 +[2025-02-21 11:11:21] (step=0541900) Train Loss: 0.1468, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 11:11:29] (step=0542000) Train Loss: 0.1472, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 11:11:37] (step=0542100) Train Loss: 0.1458, Train Steps/Sec: 12.16, Grad Norm: 0.0057 +[2025-02-21 11:11:45] (step=0542200) Train Loss: 0.1469, Train Steps/Sec: 11.68, Grad Norm: 0.0059 +[2025-02-21 11:11:56] (step=0542300) Train Loss: 0.1479, Train Steps/Sec: 9.78, Grad Norm: 0.0058 +[2025-02-21 11:12:04] (step=0542400) Train Loss: 0.1454, Train Steps/Sec: 11.32, Grad Norm: 0.0058 +[2025-02-21 11:12:12] (step=0542500) Train Loss: 0.1457, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 11:12:21] (step=0542600) Train Loss: 0.1454, Train Steps/Sec: 12.11, Grad Norm: 0.0061 +[2025-02-21 11:12:29] (step=0542700) Train Loss: 0.1454, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 11:12:37] (step=0542800) Train Loss: 0.1476, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 11:12:44] (step=0542900) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 11:12:53] (step=0543000) Train Loss: 0.1465, Train Steps/Sec: 11.12, Grad Norm: 0.0059 +[2025-02-21 11:13:02] (step=0543100) Train Loss: 0.1469, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 11:13:11] (step=0543200) Train Loss: 0.1463, Train Steps/Sec: 11.01, Grad Norm: 0.0060 +[2025-02-21 11:13:20] (step=0543300) Train Loss: 0.1465, Train Steps/Sec: 11.36, Grad Norm: 0.0060 +[2025-02-21 11:13:28] (step=0543400) Train Loss: 0.1466, Train Steps/Sec: 12.21, Grad Norm: 0.0057 +[2025-02-21 11:13:36] (step=0543500) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 11:13:44] (step=0543600) Train Loss: 0.1446, Train Steps/Sec: 12.19, Grad Norm: 0.0057 +[2025-02-21 11:13:53] (step=0543700) Train Loss: 0.1456, Train Steps/Sec: 11.31, Grad Norm: 0.0058 +[2025-02-21 11:14:02] (step=0543800) Train Loss: 0.1461, Train Steps/Sec: 11.30, Grad Norm: 0.0060 +[2025-02-21 11:14:10] (step=0543900) Train Loss: 0.1488, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 11:14:18] (step=0544000) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 11:14:26] (step=0544100) Train Loss: 0.1450, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 11:14:35] (step=0544200) Train Loss: 0.1463, Train Steps/Sec: 11.18, Grad Norm: 0.0062 +[2025-02-21 11:14:43] (step=0544300) Train Loss: 0.1455, Train Steps/Sec: 11.70, Grad Norm: 0.0059 +[2025-02-21 11:14:52] (step=0544400) Train Loss: 0.1463, Train Steps/Sec: 11.37, Grad Norm: 0.0058 +[2025-02-21 11:15:01] (step=0544500) Train Loss: 0.1467, Train Steps/Sec: 10.64, Grad Norm: 0.0058 +[2025-02-21 11:15:10] (step=0544600) Train Loss: 0.1455, Train Steps/Sec: 11.33, Grad Norm: 0.0063 +[2025-02-21 11:15:18] (step=0544700) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 11:15:26] (step=0544800) Train Loss: 0.1468, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 11:15:34] (step=0544900) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 11:15:42] (step=0545000) Train Loss: 0.1462, Train Steps/Sec: 12.54, Grad Norm: 0.0064 +[2025-02-21 11:15:50] (step=0545100) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 11:15:59] (step=0545200) Train Loss: 0.1475, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 11:16:07] (step=0545300) Train Loss: 0.1458, Train Steps/Sec: 11.82, Grad Norm: 0.0064 +[2025-02-21 11:16:16] (step=0545400) Train Loss: 0.1469, Train Steps/Sec: 11.34, Grad Norm: 0.0061 +[2025-02-21 11:16:26] (step=0545500) Train Loss: 0.1480, Train Steps/Sec: 10.25, Grad Norm: 0.0060 +[2025-02-21 11:16:33] (step=0545600) Train Loss: 0.1458, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 11:16:41] (step=0545700) Train Loss: 0.1469, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 11:16:50] (step=0545800) Train Loss: 0.1468, Train Steps/Sec: 12.16, Grad Norm: 0.0064 +[2025-02-21 11:16:59] (step=0545900) Train Loss: 0.1470, Train Steps/Sec: 11.30, Grad Norm: 0.0059 +[2025-02-21 11:17:07] (step=0546000) Train Loss: 0.1471, Train Steps/Sec: 11.30, Grad Norm: 0.0060 +[2025-02-21 11:17:16] (step=0546100) Train Loss: 0.1473, Train Steps/Sec: 12.10, Grad Norm: 0.0058 +[2025-02-21 11:17:24] (step=0546200) Train Loss: 0.1469, Train Steps/Sec: 12.55, Grad Norm: 0.0061 +[2025-02-21 11:17:32] (step=0546300) Train Loss: 0.1464, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 11:17:39] (step=0546400) Train Loss: 0.1459, Train Steps/Sec: 12.58, Grad Norm: 0.0057 +[2025-02-21 11:17:48] (step=0546500) Train Loss: 0.1474, Train Steps/Sec: 11.70, Grad Norm: 0.0061 +[2025-02-21 11:17:57] (step=0546600) Train Loss: 0.1457, Train Steps/Sec: 11.02, Grad Norm: 0.0061 +[2025-02-21 11:18:08] (step=0546700) Train Loss: 0.1471, Train Steps/Sec: 9.64, Grad Norm: 0.0059 +[2025-02-21 11:18:16] (step=0546800) Train Loss: 0.1459, Train Steps/Sec: 11.67, Grad Norm: 0.0059 +[2025-02-21 11:18:24] (step=0546900) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 11:18:32] (step=0547000) Train Loss: 0.1460, Train Steps/Sec: 12.11, Grad Norm: 0.0061 +[2025-02-21 11:18:40] (step=0547100) Train Loss: 0.1450, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 11:18:48] (step=0547200) Train Loss: 0.1472, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 11:18:56] (step=0547300) Train Loss: 0.1464, Train Steps/Sec: 12.16, Grad Norm: 0.0060 +[2025-02-21 11:19:04] (step=0547400) Train Loss: 0.1451, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 11:19:13] (step=0547500) Train Loss: 0.1464, Train Steps/Sec: 11.42, Grad Norm: 0.0060 +[2025-02-21 11:19:22] (step=0547600) Train Loss: 0.1464, Train Steps/Sec: 10.97, Grad Norm: 0.0061 +[2025-02-21 11:19:30] (step=0547700) Train Loss: 0.1459, Train Steps/Sec: 12.19, Grad Norm: 0.0060 +[2025-02-21 11:19:38] (step=0547800) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 11:19:46] (step=0547900) Train Loss: 0.1471, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 11:19:56] (step=0548000) Train Loss: 0.1466, Train Steps/Sec: 10.51, Grad Norm: 0.0061 +[2025-02-21 11:20:05] (step=0548100) Train Loss: 0.1458, Train Steps/Sec: 11.25, Grad Norm: 0.0063 +[2025-02-21 11:20:13] (step=0548200) Train Loss: 0.1462, Train Steps/Sec: 11.71, Grad Norm: 0.0058 +[2025-02-21 11:20:21] (step=0548300) Train Loss: 0.1473, Train Steps/Sec: 12.09, Grad Norm: 0.0057 +[2025-02-21 11:20:29] (step=0548400) Train Loss: 0.1458, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 11:20:37] (step=0548500) Train Loss: 0.1450, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 11:20:45] (step=0548600) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 11:20:54] (step=0548700) Train Loss: 0.1459, Train Steps/Sec: 11.75, Grad Norm: 0.0058 +[2025-02-21 11:21:03] (step=0548800) Train Loss: 0.1461, Train Steps/Sec: 11.03, Grad Norm: 0.0058 +[2025-02-21 11:21:12] (step=0548900) Train Loss: 0.1460, Train Steps/Sec: 10.65, Grad Norm: 0.0058 +[2025-02-21 11:21:21] (step=0549000) Train Loss: 0.1466, Train Steps/Sec: 11.70, Grad Norm: 0.0069 +[2025-02-21 11:21:29] (step=0549100) Train Loss: 0.1463, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 11:21:38] (step=0549200) Train Loss: 0.1458, Train Steps/Sec: 10.85, Grad Norm: 0.0061 +[2025-02-21 11:21:46] (step=0549300) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 11:21:54] (step=0549400) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 11:22:02] (step=0549500) Train Loss: 0.1465, Train Steps/Sec: 12.12, Grad Norm: 0.0058 +[2025-02-21 11:22:11] (step=0549600) Train Loss: 0.1470, Train Steps/Sec: 11.81, Grad Norm: 0.0061 +[2025-02-21 11:22:19] (step=0549700) Train Loss: 0.1470, Train Steps/Sec: 11.37, Grad Norm: 0.0058 +[2025-02-21 11:22:28] (step=0549800) Train Loss: 0.1469, Train Steps/Sec: 11.37, Grad Norm: 0.0057 +[2025-02-21 11:22:36] (step=0549900) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 11:22:44] (step=0550000) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 11:22:52] (step=0550100) Train Loss: 0.1470, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 11:23:01] (step=0550200) Train Loss: 0.1461, Train Steps/Sec: 11.34, Grad Norm: 0.0057 +[2025-02-21 11:23:09] (step=0550300) Train Loss: 0.1471, Train Steps/Sec: 11.70, Grad Norm: 0.0059 +[2025-02-21 11:23:18] (step=0550400) Train Loss: 0.1466, Train Steps/Sec: 11.75, Grad Norm: 0.0059 +[2025-02-21 11:23:27] (step=0550500) Train Loss: 0.1459, Train Steps/Sec: 10.86, Grad Norm: 0.0060 +[2025-02-21 11:23:35] (step=0550600) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 11:23:43] (step=0550700) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 11:23:51] (step=0550800) Train Loss: 0.1466, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 11:23:59] (step=0550900) Train Loss: 0.1450, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 11:24:09] (step=0551000) Train Loss: 0.1454, Train Steps/Sec: 10.37, Grad Norm: 0.0060 +[2025-02-21 11:24:18] (step=0551100) Train Loss: 0.1457, Train Steps/Sec: 10.65, Grad Norm: 0.0057 +[2025-02-21 11:24:27] (step=0551200) Train Loss: 0.1458, Train Steps/Sec: 12.14, Grad Norm: 0.0057 +[2025-02-21 11:24:35] (step=0551300) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 11:24:43] (step=0551400) Train Loss: 0.1468, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 11:24:51] (step=0551500) Train Loss: 0.1461, Train Steps/Sec: 12.58, Grad Norm: 0.0057 +[2025-02-21 11:24:59] (step=0551600) Train Loss: 0.1459, Train Steps/Sec: 12.12, Grad Norm: 0.0057 +[2025-02-21 11:25:08] (step=0551700) Train Loss: 0.1452, Train Steps/Sec: 10.85, Grad Norm: 0.0058 +[2025-02-21 11:25:17] (step=0551800) Train Loss: 0.1454, Train Steps/Sec: 11.35, Grad Norm: 0.0063 +[2025-02-21 11:25:26] (step=0551900) Train Loss: 0.1446, Train Steps/Sec: 11.29, Grad Norm: 0.0062 +[2025-02-21 11:25:34] (step=0552000) Train Loss: 0.1449, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 11:25:42] (step=0552100) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 11:25:50] (step=0552200) Train Loss: 0.1470, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 11:25:58] (step=0552300) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 11:26:07] (step=0552400) Train Loss: 0.1471, Train Steps/Sec: 11.38, Grad Norm: 0.0061 +[2025-02-21 11:26:15] (step=0552500) Train Loss: 0.1463, Train Steps/Sec: 11.71, Grad Norm: 0.0062 +[2025-02-21 11:26:24] (step=0552600) Train Loss: 0.1470, Train Steps/Sec: 11.33, Grad Norm: 0.0059 +[2025-02-21 11:26:32] (step=0552700) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 11:26:40] (step=0552800) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 11:26:48] (step=0552900) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 11:26:57] (step=0553000) Train Loss: 0.1474, Train Steps/Sec: 10.85, Grad Norm: 0.0061 +[2025-02-21 11:27:06] (step=0553100) Train Loss: 0.1448, Train Steps/Sec: 11.31, Grad Norm: 0.0059 +[2025-02-21 11:27:15] (step=0553200) Train Loss: 0.1466, Train Steps/Sec: 10.68, Grad Norm: 0.0060 +[2025-02-21 11:27:25] (step=0553300) Train Loss: 0.1469, Train Steps/Sec: 10.97, Grad Norm: 0.0062 +[2025-02-21 11:27:33] (step=0553400) Train Loss: 0.1465, Train Steps/Sec: 12.09, Grad Norm: 0.0059 +[2025-02-21 11:27:41] (step=0553500) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 11:27:49] (step=0553600) Train Loss: 0.1466, Train Steps/Sec: 12.10, Grad Norm: 0.0060 +[2025-02-21 11:27:57] (step=0553700) Train Loss: 0.1465, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 11:28:05] (step=0553800) Train Loss: 0.1455, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 11:28:14] (step=0553900) Train Loss: 0.1464, Train Steps/Sec: 11.77, Grad Norm: 0.0060 +[2025-02-21 11:28:22] (step=0554000) Train Loss: 0.1462, Train Steps/Sec: 11.73, Grad Norm: 0.0059 +[2025-02-21 11:28:31] (step=0554100) Train Loss: 0.1447, Train Steps/Sec: 10.99, Grad Norm: 0.0058 +[2025-02-21 11:28:40] (step=0554200) Train Loss: 0.1462, Train Steps/Sec: 11.22, Grad Norm: 0.0061 +[2025-02-21 11:28:48] (step=0554300) Train Loss: 0.1474, Train Steps/Sec: 12.51, Grad Norm: 0.0057 +[2025-02-21 11:28:56] (step=0554400) Train Loss: 0.1473, Train Steps/Sec: 12.50, Grad Norm: 0.0059 +[2025-02-21 11:29:04] (step=0554500) Train Loss: 0.1471, Train Steps/Sec: 12.55, Grad Norm: 0.0061 +[2025-02-21 11:29:13] (step=0554600) Train Loss: 0.1462, Train Steps/Sec: 11.36, Grad Norm: 0.0060 +[2025-02-21 11:29:22] (step=0554700) Train Loss: 0.1474, Train Steps/Sec: 11.69, Grad Norm: 0.0061 +[2025-02-21 11:29:30] (step=0554800) Train Loss: 0.1449, Train Steps/Sec: 11.36, Grad Norm: 0.0057 +[2025-02-21 11:29:38] (step=0554900) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 11:29:46] (step=0555000) Train Loss: 0.1451, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 11:29:54] (step=0555100) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 11:30:02] (step=0555200) Train Loss: 0.1464, Train Steps/Sec: 12.20, Grad Norm: 0.0063 +[2025-02-21 11:30:11] (step=0555300) Train Loss: 0.1453, Train Steps/Sec: 11.01, Grad Norm: 0.0057 +[2025-02-21 11:30:21] (step=0555400) Train Loss: 0.1471, Train Steps/Sec: 11.03, Grad Norm: 0.0061 +[2025-02-21 11:30:31] (step=0555500) Train Loss: 0.1456, Train Steps/Sec: 9.92, Grad Norm: 0.0059 +[2025-02-21 11:30:39] (step=0555600) Train Loss: 0.1460, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 11:30:47] (step=0555700) Train Loss: 0.1433, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 11:30:55] (step=0555800) Train Loss: 0.1473, Train Steps/Sec: 12.11, Grad Norm: 0.0062 +[2025-02-21 11:31:03] (step=0555900) Train Loss: 0.1470, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 11:31:12] (step=0556000) Train Loss: 0.1469, Train Steps/Sec: 11.82, Grad Norm: 0.0059 +[2025-02-21 11:31:20] (step=0556100) Train Loss: 0.1467, Train Steps/Sec: 11.76, Grad Norm: 0.0060 +[2025-02-21 11:31:29] (step=0556200) Train Loss: 0.1462, Train Steps/Sec: 11.39, Grad Norm: 0.0063 +[2025-02-21 11:31:37] (step=0556300) Train Loss: 0.1472, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 11:31:45] (step=0556400) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 11:31:53] (step=0556500) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 11:32:01] (step=0556600) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 11:32:10] (step=0556700) Train Loss: 0.1458, Train Steps/Sec: 11.18, Grad Norm: 0.0059 +[2025-02-21 11:32:19] (step=0556800) Train Loss: 0.1469, Train Steps/Sec: 11.31, Grad Norm: 0.0058 +[2025-02-21 11:32:28] (step=0556900) Train Loss: 0.1453, Train Steps/Sec: 11.30, Grad Norm: 0.0062 +[2025-02-21 11:32:36] (step=0557000) Train Loss: 0.1458, Train Steps/Sec: 11.75, Grad Norm: 0.0063 +[2025-02-21 11:32:44] (step=0557100) Train Loss: 0.1466, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 11:32:52] (step=0557200) Train Loss: 0.1482, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 11:33:00] (step=0557300) Train Loss: 0.1451, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 11:33:08] (step=0557400) Train Loss: 0.1459, Train Steps/Sec: 12.19, Grad Norm: 0.0059 +[2025-02-21 11:33:17] (step=0557500) Train Loss: 0.1452, Train Steps/Sec: 11.02, Grad Norm: 0.0058 +[2025-02-21 11:33:27] (step=0557600) Train Loss: 0.1450, Train Steps/Sec: 10.06, Grad Norm: 0.0059 +[2025-02-21 11:33:36] (step=0557700) Train Loss: 0.1467, Train Steps/Sec: 11.66, Grad Norm: 0.0058 +[2025-02-21 11:33:44] (step=0557800) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 11:33:52] (step=0557900) Train Loss: 0.1475, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 11:34:01] (step=0558000) Train Loss: 0.1449, Train Steps/Sec: 10.51, Grad Norm: 0.0058 +[2025-02-21 11:34:09] (step=0558100) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 11:34:18] (step=0558200) Train Loss: 0.1452, Train Steps/Sec: 11.06, Grad Norm: 0.0061 +[2025-02-21 11:34:26] (step=0558300) Train Loss: 0.1469, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 11:34:35] (step=0558400) Train Loss: 0.1464, Train Steps/Sec: 11.75, Grad Norm: 0.0058 +[2025-02-21 11:34:43] (step=0558500) Train Loss: 0.1460, Train Steps/Sec: 12.11, Grad Norm: 0.0063 +[2025-02-21 11:34:51] (step=0558600) Train Loss: 0.1465, Train Steps/Sec: 12.56, Grad Norm: 0.0058 +[2025-02-21 11:34:59] (step=0558700) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 11:35:07] (step=0558800) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 11:35:15] (step=0558900) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 11:35:24] (step=0559000) Train Loss: 0.1468, Train Steps/Sec: 11.36, Grad Norm: 0.0061 +[2025-02-21 11:35:33] (step=0559100) Train Loss: 0.1455, Train Steps/Sec: 11.28, Grad Norm: 0.0060 +[2025-02-21 11:35:42] (step=0559200) Train Loss: 0.1455, Train Steps/Sec: 10.56, Grad Norm: 0.0057 +[2025-02-21 11:35:50] (step=0559300) Train Loss: 0.1462, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 11:35:58] (step=0559400) Train Loss: 0.1474, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 11:36:06] (step=0559500) Train Loss: 0.1461, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 11:36:14] (step=0559600) Train Loss: 0.1463, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 11:36:23] (step=0559700) Train Loss: 0.1469, Train Steps/Sec: 10.99, Grad Norm: 0.0059 +[2025-02-21 11:36:33] (step=0559800) Train Loss: 0.1460, Train Steps/Sec: 10.07, Grad Norm: 0.0057 +[2025-02-21 11:36:42] (step=0559900) Train Loss: 0.1461, Train Steps/Sec: 11.69, Grad Norm: 0.0059 +[2025-02-21 11:36:50] (step=0560000) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 11:36:52] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0560000.pt +[2025-02-21 11:37:00] (step=0560100) Train Loss: 0.1466, Train Steps/Sec: 9.42, Grad Norm: 0.0059 +[2025-02-21 11:37:09] (step=0560200) Train Loss: 0.1462, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 11:37:17] (step=0560300) Train Loss: 0.1469, Train Steps/Sec: 11.81, Grad Norm: 0.0059 +[2025-02-21 11:37:26] (step=0560400) Train Loss: 0.1465, Train Steps/Sec: 11.76, Grad Norm: 0.0060 +[2025-02-21 11:37:35] (step=0560500) Train Loss: 0.1460, Train Steps/Sec: 10.16, Grad Norm: 0.0057 +[2025-02-21 11:37:44] (step=0560600) Train Loss: 0.1482, Train Steps/Sec: 12.06, Grad Norm: 0.0062 +[2025-02-21 11:37:52] (step=0560700) Train Loss: 0.1465, Train Steps/Sec: 12.52, Grad Norm: 0.0060 +[2025-02-21 11:38:00] (step=0560800) Train Loss: 0.1463, Train Steps/Sec: 12.53, Grad Norm: 0.0059 +[2025-02-21 11:38:08] (step=0560900) Train Loss: 0.1468, Train Steps/Sec: 12.52, Grad Norm: 0.0058 +[2025-02-21 11:38:16] (step=0561000) Train Loss: 0.1466, Train Steps/Sec: 12.53, Grad Norm: 0.0058 +[2025-02-21 11:38:24] (step=0561100) Train Loss: 0.1471, Train Steps/Sec: 12.53, Grad Norm: 0.0061 +[2025-02-21 11:38:32] (step=0561200) Train Loss: 0.1472, Train Steps/Sec: 11.30, Grad Norm: 0.0060 +[2025-02-21 11:38:42] (step=0561300) Train Loss: 0.1464, Train Steps/Sec: 10.92, Grad Norm: 0.0061 +[2025-02-21 11:38:50] (step=0561400) Train Loss: 0.1455, Train Steps/Sec: 12.06, Grad Norm: 0.0061 +[2025-02-21 11:38:58] (step=0561500) Train Loss: 0.1444, Train Steps/Sec: 12.54, Grad Norm: 0.0059 +[2025-02-21 11:39:06] (step=0561600) Train Loss: 0.1466, Train Steps/Sec: 12.52, Grad Norm: 0.0059 +[2025-02-21 11:39:15] (step=0561700) Train Loss: 0.1468, Train Steps/Sec: 11.17, Grad Norm: 0.0062 +[2025-02-21 11:39:23] (step=0561800) Train Loss: 0.1453, Train Steps/Sec: 11.69, Grad Norm: 0.0062 +[2025-02-21 11:39:32] (step=0561900) Train Loss: 0.1465, Train Steps/Sec: 11.00, Grad Norm: 0.0061 +[2025-02-21 11:39:42] (step=0562000) Train Loss: 0.1456, Train Steps/Sec: 10.34, Grad Norm: 0.0058 +[2025-02-21 11:39:51] (step=0562100) Train Loss: 0.1456, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 11:39:59] (step=0562200) Train Loss: 0.1469, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 11:40:07] (step=0562300) Train Loss: 0.1449, Train Steps/Sec: 11.72, Grad Norm: 0.0062 +[2025-02-21 11:40:15] (step=0562400) Train Loss: 0.1451, Train Steps/Sec: 12.21, Grad Norm: 0.0060 +[2025-02-21 11:40:24] (step=0562500) Train Loss: 0.1470, Train Steps/Sec: 11.45, Grad Norm: 0.0061 +[2025-02-21 11:40:33] (step=0562600) Train Loss: 0.1459, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 11:40:41] (step=0562700) Train Loss: 0.1455, Train Steps/Sec: 12.16, Grad Norm: 0.0057 +[2025-02-21 11:40:49] (step=0562800) Train Loss: 0.1458, Train Steps/Sec: 12.11, Grad Norm: 0.0060 +[2025-02-21 11:40:57] (step=0562900) Train Loss: 0.1464, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 11:41:06] (step=0563000) Train Loss: 0.1468, Train Steps/Sec: 11.22, Grad Norm: 0.0061 +[2025-02-21 11:41:14] (step=0563100) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 11:41:22] (step=0563200) Train Loss: 0.1452, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 11:41:30] (step=0563300) Train Loss: 0.1466, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 11:41:39] (step=0563400) Train Loss: 0.1457, Train Steps/Sec: 10.98, Grad Norm: 0.0062 +[2025-02-21 11:41:48] (step=0563500) Train Loss: 0.1460, Train Steps/Sec: 11.35, Grad Norm: 0.0058 +[2025-02-21 11:41:56] (step=0563600) Train Loss: 0.1467, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 11:42:04] (step=0563700) Train Loss: 0.1464, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 11:42:12] (step=0563800) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 11:42:20] (step=0563900) Train Loss: 0.1482, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 11:42:28] (step=0564000) Train Loss: 0.1460, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 11:42:37] (step=0564100) Train Loss: 0.1462, Train Steps/Sec: 11.02, Grad Norm: 0.0057 +[2025-02-21 11:42:47] (step=0564200) Train Loss: 0.1462, Train Steps/Sec: 10.29, Grad Norm: 0.0059 +[2025-02-21 11:42:57] (step=0564300) Train Loss: 0.1465, Train Steps/Sec: 10.53, Grad Norm: 0.0061 +[2025-02-21 11:43:04] (step=0564400) Train Loss: 0.1467, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 11:43:13] (step=0564500) Train Loss: 0.1464, Train Steps/Sec: 11.27, Grad Norm: 0.0062 +[2025-02-21 11:43:22] (step=0564600) Train Loss: 0.1446, Train Steps/Sec: 11.41, Grad Norm: 0.0059 +[2025-02-21 11:43:30] (step=0564700) Train Loss: 0.1471, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 11:43:39] (step=0564800) Train Loss: 0.1458, Train Steps/Sec: 11.72, Grad Norm: 0.0057 +[2025-02-21 11:43:47] (step=0564900) Train Loss: 0.1453, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 11:43:55] (step=0565000) Train Loss: 0.1467, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 11:44:03] (step=0565100) Train Loss: 0.1452, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 11:44:11] (step=0565200) Train Loss: 0.1445, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 11:44:19] (step=0565300) Train Loss: 0.1475, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 11:44:27] (step=0565400) Train Loss: 0.1479, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 11:44:36] (step=0565500) Train Loss: 0.1471, Train Steps/Sec: 10.89, Grad Norm: 0.0061 +[2025-02-21 11:44:45] (step=0565600) Train Loss: 0.1461, Train Steps/Sec: 11.29, Grad Norm: 0.0059 +[2025-02-21 11:44:54] (step=0565700) Train Loss: 0.1477, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 11:45:02] (step=0565800) Train Loss: 0.1451, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 11:45:10] (step=0565900) Train Loss: 0.1458, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 11:45:18] (step=0566000) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 11:45:26] (step=0566100) Train Loss: 0.1448, Train Steps/Sec: 12.18, Grad Norm: 0.0062 +[2025-02-21 11:45:35] (step=0566200) Train Loss: 0.1452, Train Steps/Sec: 11.73, Grad Norm: 0.0060 +[2025-02-21 11:45:44] (step=0566300) Train Loss: 0.1462, Train Steps/Sec: 10.41, Grad Norm: 0.0061 +[2025-02-21 11:45:53] (step=0566400) Train Loss: 0.1459, Train Steps/Sec: 11.33, Grad Norm: 0.0063 +[2025-02-21 11:46:02] (step=0566500) Train Loss: 0.1449, Train Steps/Sec: 11.68, Grad Norm: 0.0063 +[2025-02-21 11:46:10] (step=0566600) Train Loss: 0.1458, Train Steps/Sec: 12.17, Grad Norm: 0.0058 +[2025-02-21 11:46:19] (step=0566700) Train Loss: 0.1463, Train Steps/Sec: 11.37, Grad Norm: 0.0056 +[2025-02-21 11:46:28] (step=0566800) Train Loss: 0.1468, Train Steps/Sec: 10.22, Grad Norm: 0.0063 +[2025-02-21 11:46:37] (step=0566900) Train Loss: 0.1469, Train Steps/Sec: 11.73, Grad Norm: 0.0062 +[2025-02-21 11:46:45] (step=0567000) Train Loss: 0.1471, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 11:46:53] (step=0567100) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 11:47:01] (step=0567200) Train Loss: 0.1464, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 11:47:09] (step=0567300) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 11:47:17] (step=0567400) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 11:47:25] (step=0567500) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 11:47:33] (step=0567600) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 11:47:41] (step=0567700) Train Loss: 0.1466, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 11:47:50] (step=0567800) Train Loss: 0.1453, Train Steps/Sec: 10.92, Grad Norm: 0.0060 +[2025-02-21 11:47:59] (step=0567900) Train Loss: 0.1469, Train Steps/Sec: 11.27, Grad Norm: 0.0057 +[2025-02-21 11:48:08] (step=0568000) Train Loss: 0.1450, Train Steps/Sec: 10.86, Grad Norm: 0.0059 +[2025-02-21 11:48:16] (step=0568100) Train Loss: 0.1455, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 11:48:24] (step=0568200) Train Loss: 0.1470, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 11:48:32] (step=0568300) Train Loss: 0.1464, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 11:48:41] (step=0568400) Train Loss: 0.1459, Train Steps/Sec: 11.72, Grad Norm: 0.0057 +[2025-02-21 11:48:51] (step=0568500) Train Loss: 0.1472, Train Steps/Sec: 9.79, Grad Norm: 0.0059 +[2025-02-21 11:48:59] (step=0568600) Train Loss: 0.1457, Train Steps/Sec: 12.18, Grad Norm: 0.0058 +[2025-02-21 11:49:08] (step=0568700) Train Loss: 0.1463, Train Steps/Sec: 11.34, Grad Norm: 0.0058 +[2025-02-21 11:49:16] (step=0568800) Train Loss: 0.1465, Train Steps/Sec: 12.20, Grad Norm: 0.0060 +[2025-02-21 11:49:25] (step=0568900) Train Loss: 0.1469, Train Steps/Sec: 11.02, Grad Norm: 0.0060 +[2025-02-21 11:49:34] (step=0569000) Train Loss: 0.1456, Train Steps/Sec: 11.70, Grad Norm: 0.0060 +[2025-02-21 11:49:42] (step=0569100) Train Loss: 0.1465, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 11:49:50] (step=0569200) Train Loss: 0.1478, Train Steps/Sec: 12.55, Grad Norm: 0.0060 +[2025-02-21 11:49:59] (step=0569300) Train Loss: 0.1472, Train Steps/Sec: 10.90, Grad Norm: 0.0056 +[2025-02-21 11:50:07] (step=0569400) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 11:50:15] (step=0569500) Train Loss: 0.1476, Train Steps/Sec: 12.60, Grad Norm: 0.0056 +[2025-02-21 11:50:23] (step=0569600) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 11:50:31] (step=0569700) Train Loss: 0.1480, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 11:50:39] (step=0569800) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 11:50:47] (step=0569900) Train Loss: 0.1449, Train Steps/Sec: 11.77, Grad Norm: 0.0059 +[2025-02-21 11:50:56] (step=0570000) Train Loss: 0.1466, Train Steps/Sec: 11.75, Grad Norm: 0.0064 +[2025-02-21 11:51:05] (step=0570100) Train Loss: 0.1468, Train Steps/Sec: 10.97, Grad Norm: 0.0058 +[2025-02-21 11:51:13] (step=0570200) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 11:51:21] (step=0570300) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 11:51:29] (step=0570400) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 11:51:38] (step=0570500) Train Loss: 0.1478, Train Steps/Sec: 10.56, Grad Norm: 0.0059 +[2025-02-21 11:51:47] (step=0570600) Train Loss: 0.1464, Train Steps/Sec: 11.74, Grad Norm: 0.0062 +[2025-02-21 11:51:57] (step=0570700) Train Loss: 0.1456, Train Steps/Sec: 10.04, Grad Norm: 0.0060 +[2025-02-21 11:52:06] (step=0570800) Train Loss: 0.1460, Train Steps/Sec: 10.99, Grad Norm: 0.0060 +[2025-02-21 11:52:14] (step=0570900) Train Loss: 0.1469, Train Steps/Sec: 12.22, Grad Norm: 0.0060 +[2025-02-21 11:52:22] (step=0571000) Train Loss: 0.1464, Train Steps/Sec: 11.76, Grad Norm: 0.0058 +[2025-02-21 11:52:31] (step=0571100) Train Loss: 0.1452, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 11:52:40] (step=0571200) Train Loss: 0.1468, Train Steps/Sec: 11.73, Grad Norm: 0.0060 +[2025-02-21 11:52:48] (step=0571300) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 11:52:56] (step=0571400) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 11:53:04] (step=0571500) Train Loss: 0.1469, Train Steps/Sec: 12.13, Grad Norm: 0.0063 +[2025-02-21 11:53:12] (step=0571600) Train Loss: 0.1453, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 11:53:20] (step=0571700) Train Loss: 0.1469, Train Steps/Sec: 12.55, Grad Norm: 0.0062 +[2025-02-21 11:53:29] (step=0571800) Train Loss: 0.1464, Train Steps/Sec: 11.16, Grad Norm: 0.0061 +[2025-02-21 11:53:37] (step=0571900) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 11:53:45] (step=0572000) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 11:53:53] (step=0572100) Train Loss: 0.1467, Train Steps/Sec: 11.70, Grad Norm: 0.0060 +[2025-02-21 11:54:02] (step=0572200) Train Loss: 0.1458, Train Steps/Sec: 11.70, Grad Norm: 0.0060 +[2025-02-21 11:54:11] (step=0572300) Train Loss: 0.1463, Train Steps/Sec: 10.97, Grad Norm: 0.0062 +[2025-02-21 11:54:19] (step=0572400) Train Loss: 0.1450, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 11:54:27] (step=0572500) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 11:54:35] (step=0572600) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 11:54:43] (step=0572700) Train Loss: 0.1470, Train Steps/Sec: 11.74, Grad Norm: 0.0061 +[2025-02-21 11:54:52] (step=0572800) Train Loss: 0.1465, Train Steps/Sec: 11.77, Grad Norm: 0.0059 +[2025-02-21 11:55:02] (step=0572900) Train Loss: 0.1454, Train Steps/Sec: 10.06, Grad Norm: 0.0060 +[2025-02-21 11:55:12] (step=0573000) Train Loss: 0.1472, Train Steps/Sec: 9.89, Grad Norm: 0.0060 +[2025-02-21 11:55:20] (step=0573100) Train Loss: 0.1460, Train Steps/Sec: 11.79, Grad Norm: 0.0059 +[2025-02-21 11:55:29] (step=0573200) Train Loss: 0.1448, Train Steps/Sec: 10.97, Grad Norm: 0.0057 +[2025-02-21 11:55:38] (step=0573300) Train Loss: 0.1469, Train Steps/Sec: 11.71, Grad Norm: 0.0064 +[2025-02-21 11:55:46] (step=0573400) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 11:55:54] (step=0573500) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 11:56:02] (step=0573600) Train Loss: 0.1471, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 11:56:10] (step=0573700) Train Loss: 0.1455, Train Steps/Sec: 12.16, Grad Norm: 0.0058 +[2025-02-21 11:56:18] (step=0573800) Train Loss: 0.1470, Train Steps/Sec: 12.62, Grad Norm: 0.0056 +[2025-02-21 11:56:26] (step=0573900) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 11:56:34] (step=0574000) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 11:56:42] (step=0574100) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 11:56:50] (step=0574200) Train Loss: 0.1458, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 11:56:59] (step=0574300) Train Loss: 0.1461, Train Steps/Sec: 10.28, Grad Norm: 0.0057 +[2025-02-21 11:57:08] (step=0574400) Train Loss: 0.1471, Train Steps/Sec: 11.70, Grad Norm: 0.0062 +[2025-02-21 11:57:17] (step=0574500) Train Loss: 0.1464, Train Steps/Sec: 11.33, Grad Norm: 0.0059 +[2025-02-21 11:57:25] (step=0574600) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 11:57:33] (step=0574700) Train Loss: 0.1452, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 11:57:40] (step=0574800) Train Loss: 0.1451, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 11:57:49] (step=0574900) Train Loss: 0.1461, Train Steps/Sec: 11.73, Grad Norm: 0.0058 +[2025-02-21 11:57:58] (step=0575000) Train Loss: 0.1453, Train Steps/Sec: 11.04, Grad Norm: 0.0057 +[2025-02-21 11:58:08] (step=0575100) Train Loss: 0.1449, Train Steps/Sec: 10.07, Grad Norm: 0.0057 +[2025-02-21 11:58:17] (step=0575200) Train Loss: 0.1472, Train Steps/Sec: 11.33, Grad Norm: 0.0062 +[2025-02-21 11:58:26] (step=0575300) Train Loss: 0.1464, Train Steps/Sec: 11.40, Grad Norm: 0.0060 +[2025-02-21 11:58:34] (step=0575400) Train Loss: 0.1458, Train Steps/Sec: 11.73, Grad Norm: 0.0058 +[2025-02-21 11:58:44] (step=0575500) Train Loss: 0.1469, Train Steps/Sec: 10.49, Grad Norm: 0.0060 +[2025-02-21 11:58:52] (step=0575600) Train Loss: 0.1464, Train Steps/Sec: 12.50, Grad Norm: 0.0059 +[2025-02-21 11:59:00] (step=0575700) Train Loss: 0.1466, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 11:59:08] (step=0575800) Train Loss: 0.1461, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 11:59:16] (step=0575900) Train Loss: 0.1461, Train Steps/Sec: 12.11, Grad Norm: 0.0059 +[2025-02-21 11:59:24] (step=0576000) Train Loss: 0.1467, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 11:59:32] (step=0576100) Train Loss: 0.1452, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 11:59:40] (step=0576200) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 11:59:48] (step=0576300) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 11:59:56] (step=0576400) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 12:00:04] (step=0576500) Train Loss: 0.1474, Train Steps/Sec: 11.31, Grad Norm: 0.0058 +[2025-02-21 12:00:13] (step=0576600) Train Loss: 0.1466, Train Steps/Sec: 11.71, Grad Norm: 0.0058 +[2025-02-21 12:00:22] (step=0576700) Train Loss: 0.1463, Train Steps/Sec: 11.30, Grad Norm: 0.0057 +[2025-02-21 12:00:31] (step=0576800) Train Loss: 0.1461, Train Steps/Sec: 11.26, Grad Norm: 0.0058 +[2025-02-21 12:00:39] (step=0576900) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 12:00:47] (step=0577000) Train Loss: 0.1468, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 12:00:55] (step=0577100) Train Loss: 0.1454, Train Steps/Sec: 11.73, Grad Norm: 0.0062 +[2025-02-21 12:01:05] (step=0577200) Train Loss: 0.1455, Train Steps/Sec: 10.41, Grad Norm: 0.0059 +[2025-02-21 12:01:14] (step=0577300) Train Loss: 0.1459, Train Steps/Sec: 10.66, Grad Norm: 0.0057 +[2025-02-21 12:01:23] (step=0577400) Train Loss: 0.1457, Train Steps/Sec: 11.35, Grad Norm: 0.0056 +[2025-02-21 12:01:32] (step=0577500) Train Loss: 0.1466, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 12:01:41] (step=0577600) Train Loss: 0.1457, Train Steps/Sec: 11.30, Grad Norm: 0.0061 +[2025-02-21 12:01:48] (step=0577700) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 12:01:56] (step=0577800) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 12:02:04] (step=0577900) Train Loss: 0.1451, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 12:02:13] (step=0578000) Train Loss: 0.1467, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-21 12:02:21] (step=0578100) Train Loss: 0.1468, Train Steps/Sec: 12.12, Grad Norm: 0.0060 +[2025-02-21 12:02:29] (step=0578200) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 12:02:37] (step=0578300) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 12:02:45] (step=0578400) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 12:02:53] (step=0578500) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 12:03:01] (step=0578600) Train Loss: 0.1467, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 12:03:10] (step=0578700) Train Loss: 0.1469, Train Steps/Sec: 11.34, Grad Norm: 0.0061 +[2025-02-21 12:03:19] (step=0578800) Train Loss: 0.1472, Train Steps/Sec: 11.69, Grad Norm: 0.0060 +[2025-02-21 12:03:27] (step=0578900) Train Loss: 0.1473, Train Steps/Sec: 11.32, Grad Norm: 0.0065 +[2025-02-21 12:03:35] (step=0579000) Train Loss: 0.1451, Train Steps/Sec: 12.56, Grad Norm: 0.0058 +[2025-02-21 12:03:43] (step=0579100) Train Loss: 0.1474, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 12:03:51] (step=0579200) Train Loss: 0.1464, Train Steps/Sec: 12.51, Grad Norm: 0.0059 +[2025-02-21 12:04:01] (step=0579300) Train Loss: 0.1457, Train Steps/Sec: 10.24, Grad Norm: 0.0058 +[2025-02-21 12:04:11] (step=0579400) Train Loss: 0.1463, Train Steps/Sec: 10.40, Grad Norm: 0.0060 +[2025-02-21 12:04:20] (step=0579500) Train Loss: 0.1460, Train Steps/Sec: 10.35, Grad Norm: 0.0061 +[2025-02-21 12:04:29] (step=0579600) Train Loss: 0.1457, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 12:04:38] (step=0579700) Train Loss: 0.1473, Train Steps/Sec: 11.31, Grad Norm: 0.0062 +[2025-02-21 12:04:46] (step=0579800) Train Loss: 0.1465, Train Steps/Sec: 12.10, Grad Norm: 0.0064 +[2025-02-21 12:04:54] (step=0579900) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 12:05:02] (step=0580000) Train Loss: 0.1480, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 12:05:05] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0580000.pt +[2025-02-21 12:05:13] (step=0580100) Train Loss: 0.1455, Train Steps/Sec: 8.98, Grad Norm: 0.0065 +[2025-02-21 12:05:21] (step=0580200) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 12:05:29] (step=0580300) Train Loss: 0.1454, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 12:05:37] (step=0580400) Train Loss: 0.1471, Train Steps/Sec: 12.63, Grad Norm: 0.0057 +[2025-02-21 12:05:46] (step=0580500) Train Loss: 0.1458, Train Steps/Sec: 11.21, Grad Norm: 0.0058 +[2025-02-21 12:05:54] (step=0580600) Train Loss: 0.1463, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 12:06:02] (step=0580700) Train Loss: 0.1457, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 12:06:10] (step=0580800) Train Loss: 0.1469, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 12:06:19] (step=0580900) Train Loss: 0.1458, Train Steps/Sec: 10.96, Grad Norm: 0.0063 +[2025-02-21 12:06:28] (step=0581000) Train Loss: 0.1464, Train Steps/Sec: 11.72, Grad Norm: 0.0063 +[2025-02-21 12:06:36] (step=0581100) Train Loss: 0.1456, Train Steps/Sec: 11.73, Grad Norm: 0.0058 +[2025-02-21 12:06:44] (step=0581200) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 12:06:52] (step=0581300) Train Loss: 0.1465, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 12:07:00] (step=0581400) Train Loss: 0.1465, Train Steps/Sec: 12.17, Grad Norm: 0.0063 +[2025-02-21 12:07:09] (step=0581500) Train Loss: 0.1458, Train Steps/Sec: 11.73, Grad Norm: 0.0057 +[2025-02-21 12:07:19] (step=0581600) Train Loss: 0.1460, Train Steps/Sec: 10.33, Grad Norm: 0.0059 +[2025-02-21 12:07:29] (step=0581700) Train Loss: 0.1464, Train Steps/Sec: 10.00, Grad Norm: 0.0060 +[2025-02-21 12:07:39] (step=0581800) Train Loss: 0.1454, Train Steps/Sec: 9.87, Grad Norm: 0.0058 +[2025-02-21 12:07:47] (step=0581900) Train Loss: 0.1462, Train Steps/Sec: 12.13, Grad Norm: 0.0061 +[2025-02-21 12:07:55] (step=0582000) Train Loss: 0.1475, Train Steps/Sec: 12.12, Grad Norm: 0.0063 +[2025-02-21 12:08:03] (step=0582100) Train Loss: 0.1472, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 12:08:11] (step=0582200) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 12:08:19] (step=0582300) Train Loss: 0.1475, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 12:08:27] (step=0582400) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 12:08:35] (step=0582500) Train Loss: 0.1473, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 12:08:43] (step=0582600) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0066 +[2025-02-21 12:08:51] (step=0582700) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 12:08:59] (step=0582800) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 12:09:07] (step=0582900) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 12:09:16] (step=0583000) Train Loss: 0.1463, Train Steps/Sec: 11.26, Grad Norm: 0.0059 +[2025-02-21 12:09:25] (step=0583100) Train Loss: 0.1451, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-21 12:09:34] (step=0583200) Train Loss: 0.1452, Train Steps/Sec: 11.32, Grad Norm: 0.0058 +[2025-02-21 12:09:42] (step=0583300) Train Loss: 0.1457, Train Steps/Sec: 12.12, Grad Norm: 0.0064 +[2025-02-21 12:09:50] (step=0583400) Train Loss: 0.1451, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 12:09:58] (step=0583500) Train Loss: 0.1444, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 12:10:07] (step=0583600) Train Loss: 0.1460, Train Steps/Sec: 11.79, Grad Norm: 0.0064 +[2025-02-21 12:10:15] (step=0583700) Train Loss: 0.1466, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 12:10:26] (step=0583800) Train Loss: 0.1462, Train Steps/Sec: 9.59, Grad Norm: 0.0058 +[2025-02-21 12:10:35] (step=0583900) Train Loss: 0.1468, Train Steps/Sec: 10.65, Grad Norm: 0.0059 +[2025-02-21 12:10:44] (step=0584000) Train Loss: 0.1464, Train Steps/Sec: 11.34, Grad Norm: 0.0061 +[2025-02-21 12:10:52] (step=0584100) Train Loss: 0.1470, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 12:11:00] (step=0584200) Train Loss: 0.1469, Train Steps/Sec: 12.09, Grad Norm: 0.0058 +[2025-02-21 12:11:09] (step=0584300) Train Loss: 0.1457, Train Steps/Sec: 11.21, Grad Norm: 0.0059 +[2025-02-21 12:11:17] (step=0584400) Train Loss: 0.1465, Train Steps/Sec: 12.53, Grad Norm: 0.0060 +[2025-02-21 12:11:25] (step=0584500) Train Loss: 0.1440, Train Steps/Sec: 12.53, Grad Norm: 0.0063 +[2025-02-21 12:11:33] (step=0584600) Train Loss: 0.1460, Train Steps/Sec: 12.09, Grad Norm: 0.0066 +[2025-02-21 12:11:41] (step=0584700) Train Loss: 0.1455, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 12:11:49] (step=0584800) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 12:11:57] (step=0584900) Train Loss: 0.1470, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 12:12:05] (step=0585000) Train Loss: 0.1474, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 12:12:13] (step=0585100) Train Loss: 0.1455, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 12:12:21] (step=0585200) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 12:12:30] (step=0585300) Train Loss: 0.1453, Train Steps/Sec: 10.98, Grad Norm: 0.0058 +[2025-02-21 12:12:38] (step=0585400) Train Loss: 0.1460, Train Steps/Sec: 11.75, Grad Norm: 0.0060 +[2025-02-21 12:12:48] (step=0585500) Train Loss: 0.1461, Train Steps/Sec: 10.51, Grad Norm: 0.0058 +[2025-02-21 12:12:56] (step=0585600) Train Loss: 0.1473, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 12:13:04] (step=0585700) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 12:13:12] (step=0585800) Train Loss: 0.1477, Train Steps/Sec: 11.79, Grad Norm: 0.0063 +[2025-02-21 12:13:22] (step=0585900) Train Loss: 0.1463, Train Steps/Sec: 10.66, Grad Norm: 0.0060 +[2025-02-21 12:13:32] (step=0586000) Train Loss: 0.1463, Train Steps/Sec: 9.54, Grad Norm: 0.0058 +[2025-02-21 12:13:41] (step=0586100) Train Loss: 0.1463, Train Steps/Sec: 10.65, Grad Norm: 0.0061 +[2025-02-21 12:13:49] (step=0586200) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 12:13:57] (step=0586300) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 12:14:06] (step=0586400) Train Loss: 0.1452, Train Steps/Sec: 12.13, Grad Norm: 0.0057 +[2025-02-21 12:14:13] (step=0586500) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 12:14:21] (step=0586600) Train Loss: 0.1454, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 12:14:29] (step=0586700) Train Loss: 0.1467, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 12:14:39] (step=0586800) Train Loss: 0.1470, Train Steps/Sec: 10.86, Grad Norm: 0.0058 +[2025-02-21 12:14:47] (step=0586900) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 12:14:54] (step=0587000) Train Loss: 0.1464, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 12:15:02] (step=0587100) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 12:15:10] (step=0587200) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 12:15:18] (step=0587300) Train Loss: 0.1455, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 12:15:26] (step=0587400) Train Loss: 0.1457, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 12:15:35] (step=0587500) Train Loss: 0.1464, Train Steps/Sec: 11.33, Grad Norm: 0.0061 +[2025-02-21 12:15:44] (step=0587600) Train Loss: 0.1468, Train Steps/Sec: 11.33, Grad Norm: 0.0058 +[2025-02-21 12:15:52] (step=0587700) Train Loss: 0.1463, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 12:16:00] (step=0587800) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 12:16:08] (step=0587900) Train Loss: 0.1455, Train Steps/Sec: 12.19, Grad Norm: 0.0062 +[2025-02-21 12:16:18] (step=0588000) Train Loss: 0.1470, Train Steps/Sec: 10.25, Grad Norm: 0.0059 +[2025-02-21 12:16:28] (step=0588100) Train Loss: 0.1466, Train Steps/Sec: 10.35, Grad Norm: 0.0060 +[2025-02-21 12:16:38] (step=0588200) Train Loss: 0.1454, Train Steps/Sec: 9.45, Grad Norm: 0.0062 +[2025-02-21 12:16:47] (step=0588300) Train Loss: 0.1477, Train Steps/Sec: 11.75, Grad Norm: 0.0059 +[2025-02-21 12:16:55] (step=0588400) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 12:17:03] (step=0588500) Train Loss: 0.1455, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 12:17:11] (step=0588600) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 12:17:19] (step=0588700) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 12:17:27] (step=0588800) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0056 +[2025-02-21 12:17:35] (step=0588900) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 12:17:43] (step=0589000) Train Loss: 0.1459, Train Steps/Sec: 12.12, Grad Norm: 0.0062 +[2025-02-21 12:17:51] (step=0589100) Train Loss: 0.1454, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 12:17:59] (step=0589200) Train Loss: 0.1456, Train Steps/Sec: 12.54, Grad Norm: 0.0059 +[2025-02-21 12:18:08] (step=0589300) Train Loss: 0.1462, Train Steps/Sec: 11.29, Grad Norm: 0.0060 +[2025-02-21 12:18:16] (step=0589400) Train Loss: 0.1462, Train Steps/Sec: 12.54, Grad Norm: 0.0061 +[2025-02-21 12:18:24] (step=0589500) Train Loss: 0.1461, Train Steps/Sec: 12.52, Grad Norm: 0.0059 +[2025-02-21 12:18:32] (step=0589600) Train Loss: 0.1466, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-21 12:18:41] (step=0589700) Train Loss: 0.1453, Train Steps/Sec: 11.69, Grad Norm: 0.0064 +[2025-02-21 12:18:50] (step=0589800) Train Loss: 0.1459, Train Steps/Sec: 11.37, Grad Norm: 0.0062 +[2025-02-21 12:18:58] (step=0589900) Train Loss: 0.1475, Train Steps/Sec: 12.16, Grad Norm: 0.0063 +[2025-02-21 12:19:06] (step=0590000) Train Loss: 0.1483, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 12:19:14] (step=0590100) Train Loss: 0.1456, Train Steps/Sec: 12.23, Grad Norm: 0.0062 +[2025-02-21 12:19:23] (step=0590200) Train Loss: 0.1476, Train Steps/Sec: 11.01, Grad Norm: 0.0061 +[2025-02-21 12:19:33] (step=0590300) Train Loss: 0.1461, Train Steps/Sec: 10.71, Grad Norm: 0.0061 +[2025-02-21 12:19:43] (step=0590400) Train Loss: 0.1455, Train Steps/Sec: 9.25, Grad Norm: 0.0059 +[2025-02-21 12:19:53] (step=0590500) Train Loss: 0.1465, Train Steps/Sec: 10.86, Grad Norm: 0.0061 +[2025-02-21 12:20:00] (step=0590600) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 12:20:09] (step=0590700) Train Loss: 0.1464, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 12:20:17] (step=0590800) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 12:20:25] (step=0590900) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 12:20:32] (step=0591000) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 12:20:40] (step=0591100) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 12:20:49] (step=0591200) Train Loss: 0.1465, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 12:20:57] (step=0591300) Train Loss: 0.1461, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 12:21:05] (step=0591400) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 12:21:12] (step=0591500) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 12:21:20] (step=0591600) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 12:21:28] (step=0591700) Train Loss: 0.1461, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 12:21:38] (step=0591800) Train Loss: 0.1448, Train Steps/Sec: 10.49, Grad Norm: 0.0059 +[2025-02-21 12:21:46] (step=0591900) Train Loss: 0.1457, Train Steps/Sec: 11.63, Grad Norm: 0.0059 +[2025-02-21 12:21:55] (step=0592000) Train Loss: 0.1456, Train Steps/Sec: 11.32, Grad Norm: 0.0058 +[2025-02-21 12:22:04] (step=0592100) Train Loss: 0.1462, Train Steps/Sec: 12.13, Grad Norm: 0.0056 +[2025-02-21 12:22:12] (step=0592200) Train Loss: 0.1460, Train Steps/Sec: 12.19, Grad Norm: 0.0056 +[2025-02-21 12:22:20] (step=0592300) Train Loss: 0.1469, Train Steps/Sec: 11.77, Grad Norm: 0.0064 +[2025-02-21 12:22:29] (step=0592400) Train Loss: 0.1449, Train Steps/Sec: 11.02, Grad Norm: 0.0062 +[2025-02-21 12:22:40] (step=0592500) Train Loss: 0.1467, Train Steps/Sec: 9.79, Grad Norm: 0.0062 +[2025-02-21 12:22:49] (step=0592600) Train Loss: 0.1462, Train Steps/Sec: 10.66, Grad Norm: 0.0058 +[2025-02-21 12:22:57] (step=0592700) Train Loss: 0.1453, Train Steps/Sec: 11.70, Grad Norm: 0.0061 +[2025-02-21 12:23:05] (step=0592800) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 12:23:14] (step=0592900) Train Loss: 0.1447, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 12:23:23] (step=0593000) Train Loss: 0.1466, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-21 12:23:30] (step=0593100) Train Loss: 0.1458, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 12:23:38] (step=0593200) Train Loss: 0.1463, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 12:23:46] (step=0593300) Train Loss: 0.1467, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 12:23:55] (step=0593400) Train Loss: 0.1466, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 12:24:03] (step=0593500) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 12:24:10] (step=0593600) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 12:24:18] (step=0593700) Train Loss: 0.1477, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 12:24:26] (step=0593800) Train Loss: 0.1468, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 12:24:34] (step=0593900) Train Loss: 0.1446, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 12:24:43] (step=0594000) Train Loss: 0.1456, Train Steps/Sec: 11.72, Grad Norm: 0.0061 +[2025-02-21 12:24:51] (step=0594100) Train Loss: 0.1455, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 12:25:00] (step=0594200) Train Loss: 0.1464, Train Steps/Sec: 11.27, Grad Norm: 0.0057 +[2025-02-21 12:25:10] (step=0594300) Train Loss: 0.1445, Train Steps/Sec: 10.58, Grad Norm: 0.0057 +[2025-02-21 12:25:18] (step=0594400) Train Loss: 0.1474, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 12:25:27] (step=0594500) Train Loss: 0.1473, Train Steps/Sec: 10.99, Grad Norm: 0.0061 +[2025-02-21 12:25:36] (step=0594600) Train Loss: 0.1459, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 12:25:46] (step=0594700) Train Loss: 0.1461, Train Steps/Sec: 10.32, Grad Norm: 0.0059 +[2025-02-21 12:25:55] (step=0594800) Train Loss: 0.1462, Train Steps/Sec: 10.67, Grad Norm: 0.0060 +[2025-02-21 12:26:03] (step=0594900) Train Loss: 0.1464, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 12:26:11] (step=0595000) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 12:26:20] (step=0595100) Train Loss: 0.1458, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 12:26:28] (step=0595200) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 12:26:35] (step=0595300) Train Loss: 0.1465, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 12:26:43] (step=0595400) Train Loss: 0.1462, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 12:26:53] (step=0595500) Train Loss: 0.1463, Train Steps/Sec: 10.82, Grad Norm: 0.0059 +[2025-02-21 12:27:01] (step=0595600) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 12:27:09] (step=0595700) Train Loss: 0.1464, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 12:27:17] (step=0595800) Train Loss: 0.1457, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 12:27:24] (step=0595900) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 12:27:32] (step=0596000) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 12:27:40] (step=0596100) Train Loss: 0.1457, Train Steps/Sec: 12.53, Grad Norm: 0.0059 +[2025-02-21 12:27:49] (step=0596200) Train Loss: 0.1466, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 12:27:57] (step=0596300) Train Loss: 0.1460, Train Steps/Sec: 11.69, Grad Norm: 0.0060 +[2025-02-21 12:28:06] (step=0596400) Train Loss: 0.1471, Train Steps/Sec: 11.34, Grad Norm: 0.0062 +[2025-02-21 12:28:15] (step=0596500) Train Loss: 0.1467, Train Steps/Sec: 11.74, Grad Norm: 0.0066 +[2025-02-21 12:28:23] (step=0596600) Train Loss: 0.1448, Train Steps/Sec: 11.75, Grad Norm: 0.0059 +[2025-02-21 12:28:32] (step=0596700) Train Loss: 0.1467, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-21 12:28:43] (step=0596800) Train Loss: 0.1450, Train Steps/Sec: 9.68, Grad Norm: 0.0063 +[2025-02-21 12:28:52] (step=0596900) Train Loss: 0.1463, Train Steps/Sec: 10.35, Grad Norm: 0.0061 +[2025-02-21 12:29:01] (step=0597000) Train Loss: 0.1475, Train Steps/Sec: 10.95, Grad Norm: 0.0062 +[2025-02-21 12:29:10] (step=0597100) Train Loss: 0.1457, Train Steps/Sec: 12.12, Grad Norm: 0.0060 +[2025-02-21 12:29:18] (step=0597200) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 12:29:26] (step=0597300) Train Loss: 0.1457, Train Steps/Sec: 12.16, Grad Norm: 0.0058 +[2025-02-21 12:29:34] (step=0597400) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 12:29:42] (step=0597500) Train Loss: 0.1472, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 12:29:50] (step=0597600) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 12:29:58] (step=0597700) Train Loss: 0.1463, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 12:30:06] (step=0597800) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 12:30:14] (step=0597900) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 12:30:23] (step=0598000) Train Loss: 0.1478, Train Steps/Sec: 11.23, Grad Norm: 0.0059 +[2025-02-21 12:30:31] (step=0598100) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 12:30:38] (step=0598200) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 12:30:46] (step=0598300) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 12:30:55] (step=0598400) Train Loss: 0.1456, Train Steps/Sec: 11.37, Grad Norm: 0.0059 +[2025-02-21 12:31:04] (step=0598500) Train Loss: 0.1464, Train Steps/Sec: 11.71, Grad Norm: 0.0060 +[2025-02-21 12:31:13] (step=0598600) Train Loss: 0.1462, Train Steps/Sec: 11.38, Grad Norm: 0.0058 +[2025-02-21 12:31:21] (step=0598700) Train Loss: 0.1456, Train Steps/Sec: 11.76, Grad Norm: 0.0060 +[2025-02-21 12:31:30] (step=0598800) Train Loss: 0.1462, Train Steps/Sec: 11.34, Grad Norm: 0.0059 +[2025-02-21 12:31:39] (step=0598900) Train Loss: 0.1453, Train Steps/Sec: 11.01, Grad Norm: 0.0064 +[2025-02-21 12:31:47] (step=0599000) Train Loss: 0.1471, Train Steps/Sec: 11.73, Grad Norm: 0.0058 +[2025-02-21 12:31:57] (step=0599100) Train Loss: 0.1462, Train Steps/Sec: 10.07, Grad Norm: 0.0060 +[2025-02-21 12:32:06] (step=0599200) Train Loss: 0.1466, Train Steps/Sec: 11.28, Grad Norm: 0.0058 +[2025-02-21 12:32:15] (step=0599300) Train Loss: 0.1453, Train Steps/Sec: 10.90, Grad Norm: 0.0057 +[2025-02-21 12:32:23] (step=0599400) Train Loss: 0.1469, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 12:32:32] (step=0599500) Train Loss: 0.1464, Train Steps/Sec: 12.13, Grad Norm: 0.0061 +[2025-02-21 12:32:40] (step=0599600) Train Loss: 0.1468, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 12:32:48] (step=0599700) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 12:32:55] (step=0599800) Train Loss: 0.1468, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 12:33:04] (step=0599900) Train Loss: 0.1451, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 12:33:12] (step=0600000) Train Loss: 0.1452, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 12:33:16] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0600000.pt +[2025-02-21 12:33:24] (step=0600100) Train Loss: 0.1463, Train Steps/Sec: 7.85, Grad Norm: 0.0060 +[2025-02-21 12:33:32] (step=0600200) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 12:33:40] (step=0600300) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 12:33:48] (step=0600400) Train Loss: 0.1480, Train Steps/Sec: 12.54, Grad Norm: 0.0061 +[2025-02-21 12:33:57] (step=0600500) Train Loss: 0.1453, Train Steps/Sec: 11.21, Grad Norm: 0.0062 +[2025-02-21 12:34:06] (step=0600600) Train Loss: 0.1466, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-21 12:34:15] (step=0600700) Train Loss: 0.1464, Train Steps/Sec: 11.34, Grad Norm: 0.0057 +[2025-02-21 12:34:24] (step=0600800) Train Loss: 0.1453, Train Steps/Sec: 11.76, Grad Norm: 0.0062 +[2025-02-21 12:34:33] (step=0600900) Train Loss: 0.1463, Train Steps/Sec: 10.95, Grad Norm: 0.0062 +[2025-02-21 12:34:41] (step=0601000) Train Loss: 0.1456, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 12:34:50] (step=0601100) Train Loss: 0.1443, Train Steps/Sec: 11.01, Grad Norm: 0.0057 +[2025-02-21 12:34:59] (step=0601200) Train Loss: 0.1472, Train Steps/Sec: 11.77, Grad Norm: 0.0063 +[2025-02-21 12:35:09] (step=0601300) Train Loss: 0.1450, Train Steps/Sec: 10.06, Grad Norm: 0.0060 +[2025-02-21 12:35:17] (step=0601400) Train Loss: 0.1449, Train Steps/Sec: 11.75, Grad Norm: 0.0058 +[2025-02-21 12:35:25] (step=0601500) Train Loss: 0.1449, Train Steps/Sec: 12.12, Grad Norm: 0.0057 +[2025-02-21 12:35:33] (step=0601600) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 12:35:41] (step=0601700) Train Loss: 0.1458, Train Steps/Sec: 12.08, Grad Norm: 0.0061 +[2025-02-21 12:35:50] (step=0601800) Train Loss: 0.1452, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 12:35:58] (step=0601900) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 12:36:06] (step=0602000) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 12:36:14] (step=0602100) Train Loss: 0.1457, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 12:36:22] (step=0602200) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 12:36:30] (step=0602300) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 12:36:38] (step=0602400) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 12:36:46] (step=0602500) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 12:36:54] (step=0602600) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 12:37:02] (step=0602700) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 12:37:11] (step=0602800) Train Loss: 0.1470, Train Steps/Sec: 10.96, Grad Norm: 0.0059 +[2025-02-21 12:37:20] (step=0602900) Train Loss: 0.1460, Train Steps/Sec: 11.72, Grad Norm: 0.0058 +[2025-02-21 12:37:30] (step=0603000) Train Loss: 0.1465, Train Steps/Sec: 9.62, Grad Norm: 0.0063 +[2025-02-21 12:37:38] (step=0603100) Train Loss: 0.1459, Train Steps/Sec: 12.15, Grad Norm: 0.0057 +[2025-02-21 12:37:47] (step=0603200) Train Loss: 0.1474, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 12:37:56] (step=0603300) Train Loss: 0.1459, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 12:38:04] (step=0603400) Train Loss: 0.1464, Train Steps/Sec: 11.39, Grad Norm: 0.0059 +[2025-02-21 12:38:14] (step=0603500) Train Loss: 0.1453, Train Steps/Sec: 10.64, Grad Norm: 0.0060 +[2025-02-21 12:38:22] (step=0603600) Train Loss: 0.1455, Train Steps/Sec: 11.75, Grad Norm: 0.0064 +[2025-02-21 12:38:31] (step=0603700) Train Loss: 0.1465, Train Steps/Sec: 12.17, Grad Norm: 0.0058 +[2025-02-21 12:38:39] (step=0603800) Train Loss: 0.1474, Train Steps/Sec: 12.18, Grad Norm: 0.0058 +[2025-02-21 12:38:47] (step=0603900) Train Loss: 0.1444, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 12:38:55] (step=0604000) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 12:39:03] (step=0604100) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 12:39:11] (step=0604200) Train Loss: 0.1450, Train Steps/Sec: 12.13, Grad Norm: 0.0057 +[2025-02-21 12:39:20] (step=0604300) Train Loss: 0.1456, Train Steps/Sec: 11.22, Grad Norm: 0.0058 +[2025-02-21 12:39:28] (step=0604400) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 12:39:36] (step=0604500) Train Loss: 0.1473, Train Steps/Sec: 12.59, Grad Norm: 0.0066 +[2025-02-21 12:39:44] (step=0604600) Train Loss: 0.1455, Train Steps/Sec: 12.55, Grad Norm: 0.0063 +[2025-02-21 12:39:51] (step=0604700) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 12:39:59] (step=0604800) Train Loss: 0.1449, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 12:40:07] (step=0604900) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 12:40:17] (step=0605000) Train Loss: 0.1465, Train Steps/Sec: 10.66, Grad Norm: 0.0058 +[2025-02-21 12:40:25] (step=0605100) Train Loss: 0.1473, Train Steps/Sec: 11.37, Grad Norm: 0.0061 +[2025-02-21 12:40:35] (step=0605200) Train Loss: 0.1462, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-21 12:40:43] (step=0605300) Train Loss: 0.1464, Train Steps/Sec: 11.71, Grad Norm: 0.0061 +[2025-02-21 12:40:52] (step=0605400) Train Loss: 0.1464, Train Steps/Sec: 11.39, Grad Norm: 0.0062 +[2025-02-21 12:41:01] (step=0605500) Train Loss: 0.1452, Train Steps/Sec: 10.89, Grad Norm: 0.0063 +[2025-02-21 12:41:10] (step=0605600) Train Loss: 0.1468, Train Steps/Sec: 10.99, Grad Norm: 0.0060 +[2025-02-21 12:41:20] (step=0605700) Train Loss: 0.1465, Train Steps/Sec: 10.64, Grad Norm: 0.0059 +[2025-02-21 12:41:28] (step=0605800) Train Loss: 0.1455, Train Steps/Sec: 11.70, Grad Norm: 0.0064 +[2025-02-21 12:41:36] (step=0605900) Train Loss: 0.1460, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 12:41:44] (step=0606000) Train Loss: 0.1456, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 12:41:52] (step=0606100) Train Loss: 0.1469, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 12:42:00] (step=0606200) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 12:42:08] (step=0606300) Train Loss: 0.1448, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 12:42:16] (step=0606400) Train Loss: 0.1459, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 12:42:24] (step=0606500) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 12:42:32] (step=0606600) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 12:42:40] (step=0606700) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 12:42:49] (step=0606800) Train Loss: 0.1457, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-21 12:42:57] (step=0606900) Train Loss: 0.1464, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 12:43:05] (step=0607000) Train Loss: 0.1472, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 12:43:13] (step=0607100) Train Loss: 0.1454, Train Steps/Sec: 12.15, Grad Norm: 0.0058 +[2025-02-21 12:43:22] (step=0607200) Train Loss: 0.1469, Train Steps/Sec: 10.96, Grad Norm: 0.0058 +[2025-02-21 12:43:32] (step=0607300) Train Loss: 0.1452, Train Steps/Sec: 10.32, Grad Norm: 0.0059 +[2025-02-21 12:43:41] (step=0607400) Train Loss: 0.1443, Train Steps/Sec: 11.73, Grad Norm: 0.0055 +[2025-02-21 12:43:49] (step=0607500) Train Loss: 0.1456, Train Steps/Sec: 11.37, Grad Norm: 0.0058 +[2025-02-21 12:43:58] (step=0607600) Train Loss: 0.1467, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 12:44:06] (step=0607700) Train Loss: 0.1449, Train Steps/Sec: 11.72, Grad Norm: 0.0058 +[2025-02-21 12:44:15] (step=0607800) Train Loss: 0.1448, Train Steps/Sec: 11.39, Grad Norm: 0.0059 +[2025-02-21 12:44:24] (step=0607900) Train Loss: 0.1461, Train Steps/Sec: 10.64, Grad Norm: 0.0063 +[2025-02-21 12:44:34] (step=0608000) Train Loss: 0.1458, Train Steps/Sec: 10.53, Grad Norm: 0.0059 +[2025-02-21 12:44:42] (step=0608100) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 12:44:50] (step=0608200) Train Loss: 0.1456, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 12:44:58] (step=0608300) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 12:45:06] (step=0608400) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 12:45:14] (step=0608500) Train Loss: 0.1459, Train Steps/Sec: 12.52, Grad Norm: 0.0067 +[2025-02-21 12:45:22] (step=0608600) Train Loss: 0.1461, Train Steps/Sec: 12.08, Grad Norm: 0.0059 +[2025-02-21 12:45:30] (step=0608700) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 12:45:38] (step=0608800) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 12:45:46] (step=0608900) Train Loss: 0.1470, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 12:45:54] (step=0609000) Train Loss: 0.1449, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 12:46:02] (step=0609100) Train Loss: 0.1458, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 12:46:10] (step=0609200) Train Loss: 0.1448, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 12:46:19] (step=0609300) Train Loss: 0.1449, Train Steps/Sec: 10.93, Grad Norm: 0.0067 +[2025-02-21 12:46:29] (step=0609400) Train Loss: 0.1461, Train Steps/Sec: 10.31, Grad Norm: 0.0059 +[2025-02-21 12:46:38] (step=0609500) Train Loss: 0.1456, Train Steps/Sec: 10.98, Grad Norm: 0.0058 +[2025-02-21 12:46:47] (step=0609600) Train Loss: 0.1456, Train Steps/Sec: 10.67, Grad Norm: 0.0058 +[2025-02-21 12:46:55] (step=0609700) Train Loss: 0.1453, Train Steps/Sec: 12.65, Grad Norm: 0.0060 +[2025-02-21 12:47:03] (step=0609800) Train Loss: 0.1457, Train Steps/Sec: 12.23, Grad Norm: 0.0060 +[2025-02-21 12:47:12] (step=0609900) Train Loss: 0.1450, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 12:47:21] (step=0610000) Train Loss: 0.1455, Train Steps/Sec: 10.71, Grad Norm: 0.0064 +[2025-02-21 12:47:30] (step=0610100) Train Loss: 0.1452, Train Steps/Sec: 11.03, Grad Norm: 0.0061 +[2025-02-21 12:47:39] (step=0610200) Train Loss: 0.1465, Train Steps/Sec: 11.75, Grad Norm: 0.0059 +[2025-02-21 12:47:47] (step=0610300) Train Loss: 0.1470, Train Steps/Sec: 12.65, Grad Norm: 0.0057 +[2025-02-21 12:47:55] (step=0610400) Train Loss: 0.1452, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 12:48:04] (step=0610500) Train Loss: 0.1463, Train Steps/Sec: 11.26, Grad Norm: 0.0058 +[2025-02-21 12:48:12] (step=0610600) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 12:48:20] (step=0610700) Train Loss: 0.1465, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 12:48:28] (step=0610800) Train Loss: 0.1468, Train Steps/Sec: 12.15, Grad Norm: 0.0058 +[2025-02-21 12:48:36] (step=0610900) Train Loss: 0.1464, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 12:48:44] (step=0611000) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 12:48:52] (step=0611100) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 12:48:59] (step=0611200) Train Loss: 0.1450, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 12:49:07] (step=0611300) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 12:49:16] (step=0611400) Train Loss: 0.1451, Train Steps/Sec: 12.11, Grad Norm: 0.0063 +[2025-02-21 12:49:24] (step=0611500) Train Loss: 0.1472, Train Steps/Sec: 11.76, Grad Norm: 0.0058 +[2025-02-21 12:49:34] (step=0611600) Train Loss: 0.1454, Train Steps/Sec: 10.28, Grad Norm: 0.0057 +[2025-02-21 12:49:43] (step=0611700) Train Loss: 0.1454, Train Steps/Sec: 11.37, Grad Norm: 0.0057 +[2025-02-21 12:49:53] (step=0611800) Train Loss: 0.1463, Train Steps/Sec: 9.93, Grad Norm: 0.0062 +[2025-02-21 12:50:01] (step=0611900) Train Loss: 0.1467, Train Steps/Sec: 12.51, Grad Norm: 0.0064 +[2025-02-21 12:50:09] (step=0612000) Train Loss: 0.1462, Train Steps/Sec: 12.03, Grad Norm: 0.0060 +[2025-02-21 12:50:18] (step=0612100) Train Loss: 0.1457, Train Steps/Sec: 11.35, Grad Norm: 0.0057 +[2025-02-21 12:50:27] (step=0612200) Train Loss: 0.1476, Train Steps/Sec: 10.40, Grad Norm: 0.0059 +[2025-02-21 12:50:36] (step=0612300) Train Loss: 0.1460, Train Steps/Sec: 11.77, Grad Norm: 0.0060 +[2025-02-21 12:50:45] (step=0612400) Train Loss: 0.1460, Train Steps/Sec: 11.72, Grad Norm: 0.0059 +[2025-02-21 12:50:52] (step=0612500) Train Loss: 0.1462, Train Steps/Sec: 12.64, Grad Norm: 0.0065 +[2025-02-21 12:51:01] (step=0612600) Train Loss: 0.1466, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 12:51:09] (step=0612700) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 12:51:16] (step=0612800) Train Loss: 0.1472, Train Steps/Sec: 12.64, Grad Norm: 0.0057 +[2025-02-21 12:51:24] (step=0612900) Train Loss: 0.1461, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 12:51:34] (step=0613000) Train Loss: 0.1474, Train Steps/Sec: 10.92, Grad Norm: 0.0060 +[2025-02-21 12:51:42] (step=0613100) Train Loss: 0.1472, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 12:51:49] (step=0613200) Train Loss: 0.1474, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 12:51:57] (step=0613300) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 12:52:05] (step=0613400) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 12:52:14] (step=0613500) Train Loss: 0.1461, Train Steps/Sec: 12.19, Grad Norm: 0.0060 +[2025-02-21 12:52:21] (step=0613600) Train Loss: 0.1453, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 12:52:31] (step=0613700) Train Loss: 0.1462, Train Steps/Sec: 10.71, Grad Norm: 0.0062 +[2025-02-21 12:52:40] (step=0613800) Train Loss: 0.1456, Train Steps/Sec: 11.35, Grad Norm: 0.0061 +[2025-02-21 12:52:49] (step=0613900) Train Loss: 0.1456, Train Steps/Sec: 10.65, Grad Norm: 0.0060 +[2025-02-21 12:52:58] (step=0614000) Train Loss: 0.1456, Train Steps/Sec: 11.67, Grad Norm: 0.0058 +[2025-02-21 12:53:06] (step=0614100) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 12:53:14] (step=0614200) Train Loss: 0.1456, Train Steps/Sec: 12.09, Grad Norm: 0.0059 +[2025-02-21 12:53:24] (step=0614300) Train Loss: 0.1456, Train Steps/Sec: 9.91, Grad Norm: 0.0061 +[2025-02-21 12:53:33] (step=0614400) Train Loss: 0.1461, Train Steps/Sec: 10.66, Grad Norm: 0.0061 +[2025-02-21 12:53:42] (step=0614500) Train Loss: 0.1463, Train Steps/Sec: 11.33, Grad Norm: 0.0062 +[2025-02-21 12:53:50] (step=0614600) Train Loss: 0.1466, Train Steps/Sec: 12.18, Grad Norm: 0.0058 +[2025-02-21 12:53:58] (step=0614700) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 12:54:06] (step=0614800) Train Loss: 0.1458, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 12:54:14] (step=0614900) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 12:54:22] (step=0615000) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 12:54:31] (step=0615100) Train Loss: 0.1474, Train Steps/Sec: 12.10, Grad Norm: 0.0059 +[2025-02-21 12:54:39] (step=0615200) Train Loss: 0.1459, Train Steps/Sec: 12.54, Grad Norm: 0.0057 +[2025-02-21 12:54:47] (step=0615300) Train Loss: 0.1455, Train Steps/Sec: 12.54, Grad Norm: 0.0062 +[2025-02-21 12:54:54] (step=0615400) Train Loss: 0.1469, Train Steps/Sec: 12.55, Grad Norm: 0.0059 +[2025-02-21 12:55:03] (step=0615500) Train Loss: 0.1448, Train Steps/Sec: 11.16, Grad Norm: 0.0058 +[2025-02-21 12:55:11] (step=0615600) Train Loss: 0.1466, Train Steps/Sec: 12.56, Grad Norm: 0.0058 +[2025-02-21 12:55:20] (step=0615700) Train Loss: 0.1457, Train Steps/Sec: 12.17, Grad Norm: 0.0059 +[2025-02-21 12:55:28] (step=0615800) Train Loss: 0.1463, Train Steps/Sec: 11.72, Grad Norm: 0.0057 +[2025-02-21 12:55:37] (step=0615900) Train Loss: 0.1470, Train Steps/Sec: 10.80, Grad Norm: 0.0060 +[2025-02-21 12:55:47] (step=0616000) Train Loss: 0.1472, Train Steps/Sec: 10.97, Grad Norm: 0.0064 +[2025-02-21 12:55:55] (step=0616100) Train Loss: 0.1470, Train Steps/Sec: 11.34, Grad Norm: 0.0061 +[2025-02-21 12:56:04] (step=0616200) Train Loss: 0.1470, Train Steps/Sec: 11.71, Grad Norm: 0.0061 +[2025-02-21 12:56:12] (step=0616300) Train Loss: 0.1448, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 12:56:20] (step=0616400) Train Loss: 0.1443, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 12:56:29] (step=0616500) Train Loss: 0.1458, Train Steps/Sec: 10.99, Grad Norm: 0.0057 +[2025-02-21 12:56:39] (step=0616600) Train Loss: 0.1458, Train Steps/Sec: 10.32, Grad Norm: 0.0061 +[2025-02-21 12:56:48] (step=0616700) Train Loss: 0.1460, Train Steps/Sec: 11.37, Grad Norm: 0.0061 +[2025-02-21 12:56:57] (step=0616800) Train Loss: 0.1474, Train Steps/Sec: 11.15, Grad Norm: 0.0059 +[2025-02-21 12:57:05] (step=0616900) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 12:57:13] (step=0617000) Train Loss: 0.1463, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 12:57:21] (step=0617100) Train Loss: 0.1454, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 12:57:29] (step=0617200) Train Loss: 0.1466, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 12:57:37] (step=0617300) Train Loss: 0.1471, Train Steps/Sec: 12.16, Grad Norm: 0.0062 +[2025-02-21 12:57:45] (step=0617400) Train Loss: 0.1460, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 12:57:53] (step=0617500) Train Loss: 0.1460, Train Steps/Sec: 12.65, Grad Norm: 0.0057 +[2025-02-21 12:58:01] (step=0617600) Train Loss: 0.1465, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 12:58:09] (step=0617700) Train Loss: 0.1453, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 12:58:17] (step=0617800) Train Loss: 0.1459, Train Steps/Sec: 12.20, Grad Norm: 0.0062 +[2025-02-21 12:58:25] (step=0617900) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 12:58:35] (step=0618000) Train Loss: 0.1467, Train Steps/Sec: 10.01, Grad Norm: 0.0062 +[2025-02-21 12:58:43] (step=0618100) Train Loss: 0.1459, Train Steps/Sec: 11.66, Grad Norm: 0.0063 +[2025-02-21 12:58:53] (step=0618200) Train Loss: 0.1462, Train Steps/Sec: 10.32, Grad Norm: 0.0057 +[2025-02-21 12:59:01] (step=0618300) Train Loss: 0.1460, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 12:59:10] (step=0618400) Train Loss: 0.1467, Train Steps/Sec: 11.73, Grad Norm: 0.0059 +[2025-02-21 12:59:18] (step=0618500) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 12:59:26] (step=0618600) Train Loss: 0.1458, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 12:59:35] (step=0618700) Train Loss: 0.1458, Train Steps/Sec: 10.67, Grad Norm: 0.0056 +[2025-02-21 12:59:45] (step=0618800) Train Loss: 0.1449, Train Steps/Sec: 10.35, Grad Norm: 0.0058 +[2025-02-21 12:59:53] (step=0618900) Train Loss: 0.1460, Train Steps/Sec: 11.74, Grad Norm: 0.0056 +[2025-02-21 13:00:01] (step=0619000) Train Loss: 0.1456, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 13:00:09] (step=0619100) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 13:00:17] (step=0619200) Train Loss: 0.1462, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 13:00:26] (step=0619300) Train Loss: 0.1455, Train Steps/Sec: 11.19, Grad Norm: 0.0058 +[2025-02-21 13:00:34] (step=0619400) Train Loss: 0.1480, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 13:00:43] (step=0619500) Train Loss: 0.1466, Train Steps/Sec: 12.15, Grad Norm: 0.0057 +[2025-02-21 13:00:51] (step=0619600) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 13:00:58] (step=0619700) Train Loss: 0.1450, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 13:01:06] (step=0619800) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 13:01:15] (step=0619900) Train Loss: 0.1462, Train Steps/Sec: 12.21, Grad Norm: 0.0060 +[2025-02-21 13:01:22] (step=0620000) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 13:01:27] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0620000.pt +[2025-02-21 13:01:35] (step=0620100) Train Loss: 0.1460, Train Steps/Sec: 7.74, Grad Norm: 0.0055 +[2025-02-21 13:01:44] (step=0620200) Train Loss: 0.1464, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 13:01:53] (step=0620300) Train Loss: 0.1460, Train Steps/Sec: 11.00, Grad Norm: 0.0058 +[2025-02-21 13:02:02] (step=0620400) Train Loss: 0.1460, Train Steps/Sec: 10.90, Grad Norm: 0.0061 +[2025-02-21 13:02:11] (step=0620500) Train Loss: 0.1467, Train Steps/Sec: 10.89, Grad Norm: 0.0058 +[2025-02-21 13:02:20] (step=0620600) Train Loss: 0.1471, Train Steps/Sec: 11.67, Grad Norm: 0.0059 +[2025-02-21 13:02:28] (step=0620700) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 13:02:36] (step=0620800) Train Loss: 0.1461, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 13:02:46] (step=0620900) Train Loss: 0.1464, Train Steps/Sec: 10.67, Grad Norm: 0.0060 +[2025-02-21 13:02:55] (step=0621000) Train Loss: 0.1467, Train Steps/Sec: 10.67, Grad Norm: 0.0061 +[2025-02-21 13:03:03] (step=0621100) Train Loss: 0.1470, Train Steps/Sec: 11.75, Grad Norm: 0.0062 +[2025-02-21 13:03:11] (step=0621200) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 13:03:20] (step=0621300) Train Loss: 0.1454, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 13:03:28] (step=0621400) Train Loss: 0.1473, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 13:03:35] (step=0621500) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 13:03:43] (step=0621600) Train Loss: 0.1453, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 13:03:52] (step=0621700) Train Loss: 0.1471, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 13:04:01] (step=0621800) Train Loss: 0.1458, Train Steps/Sec: 11.25, Grad Norm: 0.0057 +[2025-02-21 13:04:08] (step=0621900) Train Loss: 0.1449, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 13:04:17] (step=0622000) Train Loss: 0.1448, Train Steps/Sec: 12.21, Grad Norm: 0.0058 +[2025-02-21 13:04:25] (step=0622100) Train Loss: 0.1447, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 13:04:33] (step=0622200) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 13:04:42] (step=0622300) Train Loss: 0.1456, Train Steps/Sec: 11.03, Grad Norm: 0.0057 +[2025-02-21 13:04:50] (step=0622400) Train Loss: 0.1453, Train Steps/Sec: 12.19, Grad Norm: 0.0057 +[2025-02-21 13:04:59] (step=0622500) Train Loss: 0.1468, Train Steps/Sec: 11.01, Grad Norm: 0.0059 +[2025-02-21 13:05:08] (step=0622600) Train Loss: 0.1457, Train Steps/Sec: 11.31, Grad Norm: 0.0057 +[2025-02-21 13:05:16] (step=0622700) Train Loss: 0.1460, Train Steps/Sec: 12.16, Grad Norm: 0.0062 +[2025-02-21 13:05:25] (step=0622800) Train Loss: 0.1458, Train Steps/Sec: 11.67, Grad Norm: 0.0058 +[2025-02-21 13:05:32] (step=0622900) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 13:05:42] (step=0623000) Train Loss: 0.1459, Train Steps/Sec: 10.55, Grad Norm: 0.0058 +[2025-02-21 13:05:52] (step=0623100) Train Loss: 0.1452, Train Steps/Sec: 10.29, Grad Norm: 0.0061 +[2025-02-21 13:06:01] (step=0623200) Train Loss: 0.1468, Train Steps/Sec: 10.97, Grad Norm: 0.0062 +[2025-02-21 13:06:09] (step=0623300) Train Loss: 0.1467, Train Steps/Sec: 11.76, Grad Norm: 0.0057 +[2025-02-21 13:06:17] (step=0623400) Train Loss: 0.1453, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 13:06:25] (step=0623500) Train Loss: 0.1462, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 13:06:33] (step=0623600) Train Loss: 0.1453, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 13:06:41] (step=0623700) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 13:06:49] (step=0623800) Train Loss: 0.1467, Train Steps/Sec: 12.16, Grad Norm: 0.0058 +[2025-02-21 13:06:57] (step=0623900) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 13:07:05] (step=0624000) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0067 +[2025-02-21 13:07:13] (step=0624100) Train Loss: 0.1454, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 13:07:21] (step=0624200) Train Loss: 0.1465, Train Steps/Sec: 12.21, Grad Norm: 0.0058 +[2025-02-21 13:07:30] (step=0624300) Train Loss: 0.1458, Train Steps/Sec: 11.25, Grad Norm: 0.0058 +[2025-02-21 13:07:39] (step=0624400) Train Loss: 0.1469, Train Steps/Sec: 11.01, Grad Norm: 0.0061 +[2025-02-21 13:07:47] (step=0624500) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 13:07:56] (step=0624600) Train Loss: 0.1455, Train Steps/Sec: 11.38, Grad Norm: 0.0059 +[2025-02-21 13:08:05] (step=0624700) Train Loss: 0.1471, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-21 13:08:14] (step=0624800) Train Loss: 0.1463, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 13:08:22] (step=0624900) Train Loss: 0.1465, Train Steps/Sec: 12.17, Grad Norm: 0.0063 +[2025-02-21 13:08:30] (step=0625000) Train Loss: 0.1456, Train Steps/Sec: 11.74, Grad Norm: 0.0060 +[2025-02-21 13:08:38] (step=0625100) Train Loss: 0.1469, Train Steps/Sec: 12.15, Grad Norm: 0.0064 +[2025-02-21 13:08:47] (step=0625200) Train Loss: 0.1454, Train Steps/Sec: 11.36, Grad Norm: 0.0059 +[2025-02-21 13:08:56] (step=0625300) Train Loss: 0.1450, Train Steps/Sec: 11.04, Grad Norm: 0.0059 +[2025-02-21 13:09:06] (step=0625400) Train Loss: 0.1468, Train Steps/Sec: 10.65, Grad Norm: 0.0062 +[2025-02-21 13:09:14] (step=0625500) Train Loss: 0.1470, Train Steps/Sec: 11.59, Grad Norm: 0.0063 +[2025-02-21 13:09:23] (step=0625600) Train Loss: 0.1460, Train Steps/Sec: 11.79, Grad Norm: 0.0062 +[2025-02-21 13:09:31] (step=0625700) Train Loss: 0.1461, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 13:09:39] (step=0625800) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 13:09:47] (step=0625900) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 13:09:55] (step=0626000) Train Loss: 0.1451, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 13:10:03] (step=0626100) Train Loss: 0.1466, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 13:10:11] (step=0626200) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 13:10:19] (step=0626300) Train Loss: 0.1461, Train Steps/Sec: 12.22, Grad Norm: 0.0062 +[2025-02-21 13:10:27] (step=0626400) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 13:10:36] (step=0626500) Train Loss: 0.1461, Train Steps/Sec: 11.75, Grad Norm: 0.0058 +[2025-02-21 13:10:44] (step=0626600) Train Loss: 0.1470, Train Steps/Sec: 11.77, Grad Norm: 0.0058 +[2025-02-21 13:10:53] (step=0626700) Train Loss: 0.1463, Train Steps/Sec: 11.81, Grad Norm: 0.0060 +[2025-02-21 13:11:02] (step=0626800) Train Loss: 0.1458, Train Steps/Sec: 10.54, Grad Norm: 0.0058 +[2025-02-21 13:11:11] (step=0626900) Train Loss: 0.1478, Train Steps/Sec: 11.30, Grad Norm: 0.0061 +[2025-02-21 13:11:19] (step=0627000) Train Loss: 0.1473, Train Steps/Sec: 12.16, Grad Norm: 0.0058 +[2025-02-21 13:11:27] (step=0627100) Train Loss: 0.1455, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 13:11:36] (step=0627200) Train Loss: 0.1468, Train Steps/Sec: 11.73, Grad Norm: 0.0057 +[2025-02-21 13:11:44] (step=0627300) Train Loss: 0.1453, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 13:11:53] (step=0627400) Train Loss: 0.1467, Train Steps/Sec: 11.34, Grad Norm: 0.0059 +[2025-02-21 13:12:02] (step=0627500) Train Loss: 0.1464, Train Steps/Sec: 10.70, Grad Norm: 0.0059 +[2025-02-21 13:12:11] (step=0627600) Train Loss: 0.1464, Train Steps/Sec: 11.02, Grad Norm: 0.0060 +[2025-02-21 13:12:20] (step=0627700) Train Loss: 0.1463, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 13:12:28] (step=0627800) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 13:12:36] (step=0627900) Train Loss: 0.1461, Train Steps/Sec: 12.16, Grad Norm: 0.0063 +[2025-02-21 13:12:44] (step=0628000) Train Loss: 0.1458, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 13:12:53] (step=0628100) Train Loss: 0.1468, Train Steps/Sec: 11.27, Grad Norm: 0.0056 +[2025-02-21 13:13:01] (step=0628200) Train Loss: 0.1453, Train Steps/Sec: 12.11, Grad Norm: 0.0060 +[2025-02-21 13:13:09] (step=0628300) Train Loss: 0.1464, Train Steps/Sec: 12.51, Grad Norm: 0.0069 +[2025-02-21 13:13:17] (step=0628400) Train Loss: 0.1465, Train Steps/Sec: 12.50, Grad Norm: 0.0058 +[2025-02-21 13:13:25] (step=0628500) Train Loss: 0.1458, Train Steps/Sec: 12.09, Grad Norm: 0.0063 +[2025-02-21 13:13:33] (step=0628600) Train Loss: 0.1464, Train Steps/Sec: 12.05, Grad Norm: 0.0059 +[2025-02-21 13:13:42] (step=0628700) Train Loss: 0.1477, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 13:13:51] (step=0628800) Train Loss: 0.1464, Train Steps/Sec: 12.11, Grad Norm: 0.0058 +[2025-02-21 13:13:59] (step=0628900) Train Loss: 0.1466, Train Steps/Sec: 11.68, Grad Norm: 0.0057 +[2025-02-21 13:14:07] (step=0629000) Train Loss: 0.1467, Train Steps/Sec: 12.06, Grad Norm: 0.0063 +[2025-02-21 13:14:16] (step=0629100) Train Loss: 0.1462, Train Steps/Sec: 11.21, Grad Norm: 0.0059 +[2025-02-21 13:14:25] (step=0629200) Train Loss: 0.1460, Train Steps/Sec: 12.02, Grad Norm: 0.0058 +[2025-02-21 13:14:34] (step=0629300) Train Loss: 0.1466, Train Steps/Sec: 10.53, Grad Norm: 0.0058 +[2025-02-21 13:14:42] (step=0629400) Train Loss: 0.1462, Train Steps/Sec: 12.12, Grad Norm: 0.0063 +[2025-02-21 13:14:51] (step=0629500) Train Loss: 0.1472, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 13:15:00] (step=0629600) Train Loss: 0.1467, Train Steps/Sec: 10.62, Grad Norm: 0.0059 +[2025-02-21 13:15:09] (step=0629700) Train Loss: 0.1475, Train Steps/Sec: 11.02, Grad Norm: 0.0060 +[2025-02-21 13:15:18] (step=0629800) Train Loss: 0.1463, Train Steps/Sec: 11.00, Grad Norm: 0.0063 +[2025-02-21 13:15:26] (step=0629900) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 13:15:34] (step=0630000) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 13:15:42] (step=0630100) Train Loss: 0.1459, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 13:15:50] (step=0630200) Train Loss: 0.1461, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 13:15:58] (step=0630300) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 13:16:06] (step=0630400) Train Loss: 0.1454, Train Steps/Sec: 12.10, Grad Norm: 0.0063 +[2025-02-21 13:16:14] (step=0630500) Train Loss: 0.1460, Train Steps/Sec: 12.55, Grad Norm: 0.0059 +[2025-02-21 13:16:24] (step=0630600) Train Loss: 0.1463, Train Steps/Sec: 10.87, Grad Norm: 0.0062 +[2025-02-21 13:16:31] (step=0630700) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 13:16:41] (step=0630800) Train Loss: 0.1469, Train Steps/Sec: 11.02, Grad Norm: 0.0064 +[2025-02-21 13:16:48] (step=0630900) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 13:16:57] (step=0631000) Train Loss: 0.1453, Train Steps/Sec: 11.42, Grad Norm: 0.0062 +[2025-02-21 13:17:05] (step=0631100) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 13:17:13] (step=0631200) Train Loss: 0.1467, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 13:17:22] (step=0631300) Train Loss: 0.1464, Train Steps/Sec: 11.32, Grad Norm: 0.0058 +[2025-02-21 13:17:30] (step=0631400) Train Loss: 0.1469, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 13:17:39] (step=0631500) Train Loss: 0.1450, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 13:17:47] (step=0631600) Train Loss: 0.1460, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 13:17:55] (step=0631700) Train Loss: 0.1473, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 13:18:06] (step=0631800) Train Loss: 0.1467, Train Steps/Sec: 9.58, Grad Norm: 0.0062 +[2025-02-21 13:18:15] (step=0631900) Train Loss: 0.1461, Train Steps/Sec: 10.60, Grad Norm: 0.0062 +[2025-02-21 13:18:24] (step=0632000) Train Loss: 0.1468, Train Steps/Sec: 11.35, Grad Norm: 0.0058 +[2025-02-21 13:18:32] (step=0632100) Train Loss: 0.1458, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 13:18:40] (step=0632200) Train Loss: 0.1468, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 13:18:48] (step=0632300) Train Loss: 0.1466, Train Steps/Sec: 12.10, Grad Norm: 0.0061 +[2025-02-21 13:18:56] (step=0632400) Train Loss: 0.1459, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 13:19:04] (step=0632500) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 13:19:12] (step=0632600) Train Loss: 0.1463, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 13:19:21] (step=0632700) Train Loss: 0.1466, Train Steps/Sec: 12.17, Grad Norm: 0.0064 +[2025-02-21 13:19:29] (step=0632800) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 13:19:37] (step=0632900) Train Loss: 0.1451, Train Steps/Sec: 11.75, Grad Norm: 0.0061 +[2025-02-21 13:19:45] (step=0633000) Train Loss: 0.1470, Train Steps/Sec: 12.07, Grad Norm: 0.0063 +[2025-02-21 13:19:55] (step=0633100) Train Loss: 0.1460, Train Steps/Sec: 10.26, Grad Norm: 0.0061 +[2025-02-21 13:20:03] (step=0633200) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 13:20:11] (step=0633300) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 13:20:20] (step=0633400) Train Loss: 0.1460, Train Steps/Sec: 11.73, Grad Norm: 0.0064 +[2025-02-21 13:20:28] (step=0633500) Train Loss: 0.1479, Train Steps/Sec: 11.73, Grad Norm: 0.0065 +[2025-02-21 13:20:36] (step=0633600) Train Loss: 0.1466, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 13:20:45] (step=0633700) Train Loss: 0.1473, Train Steps/Sec: 11.35, Grad Norm: 0.0058 +[2025-02-21 13:20:53] (step=0633800) Train Loss: 0.1461, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 13:21:01] (step=0633900) Train Loss: 0.1455, Train Steps/Sec: 12.17, Grad Norm: 0.0058 +[2025-02-21 13:21:11] (step=0634000) Train Loss: 0.1462, Train Steps/Sec: 10.34, Grad Norm: 0.0060 +[2025-02-21 13:21:20] (step=0634100) Train Loss: 0.1459, Train Steps/Sec: 10.99, Grad Norm: 0.0060 +[2025-02-21 13:21:29] (step=0634200) Train Loss: 0.1475, Train Steps/Sec: 11.37, Grad Norm: 0.0060 +[2025-02-21 13:21:38] (step=0634300) Train Loss: 0.1457, Train Steps/Sec: 11.23, Grad Norm: 0.0063 +[2025-02-21 13:21:46] (step=0634400) Train Loss: 0.1469, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 13:21:54] (step=0634500) Train Loss: 0.1455, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 13:22:02] (step=0634600) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 13:22:10] (step=0634700) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 13:22:18] (step=0634800) Train Loss: 0.1463, Train Steps/Sec: 12.13, Grad Norm: 0.0064 +[2025-02-21 13:22:26] (step=0634900) Train Loss: 0.1458, Train Steps/Sec: 12.21, Grad Norm: 0.0062 +[2025-02-21 13:22:34] (step=0635000) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 13:22:43] (step=0635100) Train Loss: 0.1464, Train Steps/Sec: 11.38, Grad Norm: 0.0058 +[2025-02-21 13:22:51] (step=0635200) Train Loss: 0.1476, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 13:23:00] (step=0635300) Train Loss: 0.1466, Train Steps/Sec: 11.77, Grad Norm: 0.0064 +[2025-02-21 13:23:08] (step=0635400) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 13:23:16] (step=0635500) Train Loss: 0.1455, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 13:23:25] (step=0635600) Train Loss: 0.1462, Train Steps/Sec: 10.55, Grad Norm: 0.0061 +[2025-02-21 13:23:34] (step=0635700) Train Loss: 0.1458, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 13:23:42] (step=0635800) Train Loss: 0.1464, Train Steps/Sec: 11.70, Grad Norm: 0.0058 +[2025-02-21 13:23:51] (step=0635900) Train Loss: 0.1474, Train Steps/Sec: 11.71, Grad Norm: 0.0061 +[2025-02-21 13:23:59] (step=0636000) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 13:24:07] (step=0636100) Train Loss: 0.1464, Train Steps/Sec: 11.75, Grad Norm: 0.0059 +[2025-02-21 13:24:16] (step=0636200) Train Loss: 0.1462, Train Steps/Sec: 10.69, Grad Norm: 0.0059 +[2025-02-21 13:24:26] (step=0636300) Train Loss: 0.1453, Train Steps/Sec: 11.01, Grad Norm: 0.0061 +[2025-02-21 13:24:34] (step=0636400) Train Loss: 0.1454, Train Steps/Sec: 11.38, Grad Norm: 0.0059 +[2025-02-21 13:24:42] (step=0636500) Train Loss: 0.1456, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 13:24:50] (step=0636600) Train Loss: 0.1455, Train Steps/Sec: 12.16, Grad Norm: 0.0065 +[2025-02-21 13:24:58] (step=0636700) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 13:25:07] (step=0636800) Train Loss: 0.1464, Train Steps/Sec: 11.26, Grad Norm: 0.0066 +[2025-02-21 13:25:16] (step=0636900) Train Loss: 0.1453, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 13:25:24] (step=0637000) Train Loss: 0.1457, Train Steps/Sec: 12.19, Grad Norm: 0.0060 +[2025-02-21 13:25:32] (step=0637100) Train Loss: 0.1470, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 13:25:41] (step=0637200) Train Loss: 0.1452, Train Steps/Sec: 11.03, Grad Norm: 0.0059 +[2025-02-21 13:25:49] (step=0637300) Train Loss: 0.1452, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 13:25:57] (step=0637400) Train Loss: 0.1460, Train Steps/Sec: 11.43, Grad Norm: 0.0061 +[2025-02-21 13:26:05] (step=0637500) Train Loss: 0.1464, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 13:26:13] (step=0637600) Train Loss: 0.1449, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 13:26:21] (step=0637700) Train Loss: 0.1464, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 13:26:30] (step=0637800) Train Loss: 0.1451, Train Steps/Sec: 11.74, Grad Norm: 0.0060 +[2025-02-21 13:26:38] (step=0637900) Train Loss: 0.1459, Train Steps/Sec: 11.75, Grad Norm: 0.0058 +[2025-02-21 13:26:47] (step=0638000) Train Loss: 0.1459, Train Steps/Sec: 11.71, Grad Norm: 0.0062 +[2025-02-21 13:26:56] (step=0638100) Train Loss: 0.1465, Train Steps/Sec: 10.53, Grad Norm: 0.0059 +[2025-02-21 13:27:04] (step=0638200) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 13:27:13] (step=0638300) Train Loss: 0.1468, Train Steps/Sec: 10.95, Grad Norm: 0.0061 +[2025-02-21 13:27:22] (step=0638400) Train Loss: 0.1458, Train Steps/Sec: 11.00, Grad Norm: 0.0059 +[2025-02-21 13:27:31] (step=0638500) Train Loss: 0.1462, Train Steps/Sec: 11.00, Grad Norm: 0.0059 +[2025-02-21 13:27:40] (step=0638600) Train Loss: 0.1462, Train Steps/Sec: 11.71, Grad Norm: 0.0061 +[2025-02-21 13:27:48] (step=0638700) Train Loss: 0.1444, Train Steps/Sec: 12.60, Grad Norm: 0.0056 +[2025-02-21 13:27:56] (step=0638800) Train Loss: 0.1455, Train Steps/Sec: 12.13, Grad Norm: 0.0061 +[2025-02-21 13:28:04] (step=0638900) Train Loss: 0.1475, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 13:28:12] (step=0639000) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 13:28:20] (step=0639100) Train Loss: 0.1470, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 13:28:28] (step=0639200) Train Loss: 0.1452, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 13:28:39] (step=0639300) Train Loss: 0.1454, Train Steps/Sec: 9.90, Grad Norm: 0.0062 +[2025-02-21 13:28:47] (step=0639400) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 13:28:55] (step=0639500) Train Loss: 0.1461, Train Steps/Sec: 11.78, Grad Norm: 0.0061 +[2025-02-21 13:29:03] (step=0639600) Train Loss: 0.1455, Train Steps/Sec: 12.16, Grad Norm: 0.0056 +[2025-02-21 13:29:11] (step=0639700) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 13:29:19] (step=0639800) Train Loss: 0.1456, Train Steps/Sec: 12.63, Grad Norm: 0.0057 +[2025-02-21 13:29:27] (step=0639900) Train Loss: 0.1473, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 13:29:36] (step=0640000) Train Loss: 0.1457, Train Steps/Sec: 11.75, Grad Norm: 0.0058 +[2025-02-21 13:29:39] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0640000.pt +[2025-02-21 13:29:47] (step=0640100) Train Loss: 0.1453, Train Steps/Sec: 8.59, Grad Norm: 0.0058 +[2025-02-21 13:29:56] (step=0640200) Train Loss: 0.1454, Train Steps/Sec: 11.74, Grad Norm: 0.0058 +[2025-02-21 13:30:04] (step=0640300) Train Loss: 0.1467, Train Steps/Sec: 11.71, Grad Norm: 0.0063 +[2025-02-21 13:30:12] (step=0640400) Train Loss: 0.1460, Train Steps/Sec: 12.15, Grad Norm: 0.0057 +[2025-02-21 13:30:21] (step=0640500) Train Loss: 0.1456, Train Steps/Sec: 11.33, Grad Norm: 0.0060 +[2025-02-21 13:30:32] (step=0640600) Train Loss: 0.1458, Train Steps/Sec: 9.77, Grad Norm: 0.0060 +[2025-02-21 13:30:40] (step=0640700) Train Loss: 0.1458, Train Steps/Sec: 11.36, Grad Norm: 0.0057 +[2025-02-21 13:30:49] (step=0640800) Train Loss: 0.1454, Train Steps/Sec: 11.73, Grad Norm: 0.0056 +[2025-02-21 13:30:57] (step=0640900) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 13:31:05] (step=0641000) Train Loss: 0.1467, Train Steps/Sec: 12.12, Grad Norm: 0.0060 +[2025-02-21 13:31:13] (step=0641100) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 13:31:21] (step=0641200) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 13:31:29] (step=0641300) Train Loss: 0.1463, Train Steps/Sec: 11.77, Grad Norm: 0.0057 +[2025-02-21 13:31:37] (step=0641400) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 13:31:46] (step=0641500) Train Loss: 0.1454, Train Steps/Sec: 10.98, Grad Norm: 0.0058 +[2025-02-21 13:31:55] (step=0641600) Train Loss: 0.1453, Train Steps/Sec: 11.77, Grad Norm: 0.0060 +[2025-02-21 13:32:03] (step=0641700) Train Loss: 0.1457, Train Steps/Sec: 12.14, Grad Norm: 0.0056 +[2025-02-21 13:32:12] (step=0641800) Train Loss: 0.1472, Train Steps/Sec: 11.25, Grad Norm: 0.0059 +[2025-02-21 13:32:20] (step=0641900) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 13:32:28] (step=0642000) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 13:32:36] (step=0642100) Train Loss: 0.1465, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 13:32:44] (step=0642200) Train Loss: 0.1462, Train Steps/Sec: 11.70, Grad Norm: 0.0061 +[2025-02-21 13:32:53] (step=0642300) Train Loss: 0.1461, Train Steps/Sec: 11.68, Grad Norm: 0.0058 +[2025-02-21 13:33:02] (step=0642400) Train Loss: 0.1461, Train Steps/Sec: 11.72, Grad Norm: 0.0065 +[2025-02-21 13:33:10] (step=0642500) Train Loss: 0.1458, Train Steps/Sec: 11.72, Grad Norm: 0.0059 +[2025-02-21 13:33:18] (step=0642600) Train Loss: 0.1448, Train Steps/Sec: 12.16, Grad Norm: 0.0057 +[2025-02-21 13:33:27] (step=0642700) Train Loss: 0.1462, Train Steps/Sec: 11.35, Grad Norm: 0.0059 +[2025-02-21 13:33:36] (step=0642800) Train Loss: 0.1465, Train Steps/Sec: 10.68, Grad Norm: 0.0061 +[2025-02-21 13:33:46] (step=0642900) Train Loss: 0.1465, Train Steps/Sec: 11.00, Grad Norm: 0.0058 +[2025-02-21 13:33:54] (step=0643000) Train Loss: 0.1455, Train Steps/Sec: 12.09, Grad Norm: 0.0058 +[2025-02-21 13:34:03] (step=0643100) Train Loss: 0.1448, Train Steps/Sec: 11.28, Grad Norm: 0.0057 +[2025-02-21 13:34:11] (step=0643200) Train Loss: 0.1464, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 13:34:19] (step=0643300) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 13:34:27] (step=0643400) Train Loss: 0.1477, Train Steps/Sec: 12.21, Grad Norm: 0.0061 +[2025-02-21 13:34:35] (step=0643500) Train Loss: 0.1465, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 13:34:44] (step=0643600) Train Loss: 0.1460, Train Steps/Sec: 11.41, Grad Norm: 0.0061 +[2025-02-21 13:34:52] (step=0643700) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 13:35:01] (step=0643800) Train Loss: 0.1460, Train Steps/Sec: 11.43, Grad Norm: 0.0060 +[2025-02-21 13:35:09] (step=0643900) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 13:35:17] (step=0644000) Train Loss: 0.1465, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 13:35:25] (step=0644100) Train Loss: 0.1477, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 13:35:32] (step=0644200) Train Loss: 0.1476, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 13:35:41] (step=0644300) Train Loss: 0.1462, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-21 13:35:50] (step=0644400) Train Loss: 0.1454, Train Steps/Sec: 11.71, Grad Norm: 0.0057 +[2025-02-21 13:35:59] (step=0644500) Train Loss: 0.1449, Train Steps/Sec: 11.30, Grad Norm: 0.0063 +[2025-02-21 13:36:07] (step=0644600) Train Loss: 0.1453, Train Steps/Sec: 11.75, Grad Norm: 0.0059 +[2025-02-21 13:36:16] (step=0644700) Train Loss: 0.1457, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 13:36:24] (step=0644800) Train Loss: 0.1458, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 13:36:33] (step=0644900) Train Loss: 0.1462, Train Steps/Sec: 10.67, Grad Norm: 0.0059 +[2025-02-21 13:36:43] (step=0645000) Train Loss: 0.1476, Train Steps/Sec: 10.65, Grad Norm: 0.0060 +[2025-02-21 13:36:51] (step=0645100) Train Loss: 0.1464, Train Steps/Sec: 11.77, Grad Norm: 0.0061 +[2025-02-21 13:36:59] (step=0645200) Train Loss: 0.1464, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 13:37:07] (step=0645300) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 13:37:15] (step=0645400) Train Loss: 0.1459, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 13:37:23] (step=0645500) Train Loss: 0.1469, Train Steps/Sec: 12.50, Grad Norm: 0.0058 +[2025-02-21 13:37:33] (step=0645600) Train Loss: 0.1461, Train Steps/Sec: 10.93, Grad Norm: 0.0061 +[2025-02-21 13:37:41] (step=0645700) Train Loss: 0.1455, Train Steps/Sec: 11.33, Grad Norm: 0.0066 +[2025-02-21 13:37:50] (step=0645800) Train Loss: 0.1471, Train Steps/Sec: 11.67, Grad Norm: 0.0058 +[2025-02-21 13:37:58] (step=0645900) Train Loss: 0.1474, Train Steps/Sec: 11.78, Grad Norm: 0.0059 +[2025-02-21 13:38:07] (step=0646000) Train Loss: 0.1470, Train Steps/Sec: 12.14, Grad Norm: 0.0064 +[2025-02-21 13:38:15] (step=0646100) Train Loss: 0.1462, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 13:38:23] (step=0646200) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 13:38:31] (step=0646300) Train Loss: 0.1462, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 13:38:39] (step=0646400) Train Loss: 0.1467, Train Steps/Sec: 12.52, Grad Norm: 0.0065 +[2025-02-21 13:38:46] (step=0646500) Train Loss: 0.1472, Train Steps/Sec: 12.57, Grad Norm: 0.0064 +[2025-02-21 13:38:55] (step=0646600) Train Loss: 0.1464, Train Steps/Sec: 11.71, Grad Norm: 0.0058 +[2025-02-21 13:39:04] (step=0646700) Train Loss: 0.1466, Train Steps/Sec: 11.29, Grad Norm: 0.0064 +[2025-02-21 13:39:13] (step=0646800) Train Loss: 0.1460, Train Steps/Sec: 10.52, Grad Norm: 0.0061 +[2025-02-21 13:39:22] (step=0646900) Train Loss: 0.1460, Train Steps/Sec: 12.13, Grad Norm: 0.0061 +[2025-02-21 13:39:30] (step=0647000) Train Loss: 0.1458, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 13:39:39] (step=0647100) Train Loss: 0.1456, Train Steps/Sec: 10.66, Grad Norm: 0.0058 +[2025-02-21 13:39:49] (step=0647200) Train Loss: 0.1456, Train Steps/Sec: 10.65, Grad Norm: 0.0056 +[2025-02-21 13:39:57] (step=0647300) Train Loss: 0.1477, Train Steps/Sec: 11.35, Grad Norm: 0.0058 +[2025-02-21 13:40:05] (step=0647400) Train Loss: 0.1469, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 13:40:13] (step=0647500) Train Loss: 0.1468, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 13:40:22] (step=0647600) Train Loss: 0.1458, Train Steps/Sec: 12.13, Grad Norm: 0.0064 +[2025-02-21 13:40:30] (step=0647700) Train Loss: 0.1465, Train Steps/Sec: 12.17, Grad Norm: 0.0058 +[2025-02-21 13:40:38] (step=0647800) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 13:40:47] (step=0647900) Train Loss: 0.1463, Train Steps/Sec: 10.65, Grad Norm: 0.0058 +[2025-02-21 13:40:55] (step=0648000) Train Loss: 0.1448, Train Steps/Sec: 12.54, Grad Norm: 0.0061 +[2025-02-21 13:41:05] (step=0648100) Train Loss: 0.1456, Train Steps/Sec: 10.37, Grad Norm: 0.0060 +[2025-02-21 13:41:13] (step=0648200) Train Loss: 0.1451, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 13:41:21] (step=0648300) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 13:41:29] (step=0648400) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 13:41:36] (step=0648500) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 13:41:44] (step=0648600) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 13:41:52] (step=0648700) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 13:42:01] (step=0648800) Train Loss: 0.1472, Train Steps/Sec: 11.72, Grad Norm: 0.0057 +[2025-02-21 13:42:10] (step=0648900) Train Loss: 0.1461, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 13:42:18] (step=0649000) Train Loss: 0.1453, Train Steps/Sec: 11.74, Grad Norm: 0.0056 +[2025-02-21 13:42:27] (step=0649100) Train Loss: 0.1455, Train Steps/Sec: 12.10, Grad Norm: 0.0059 +[2025-02-21 13:42:35] (step=0649200) Train Loss: 0.1467, Train Steps/Sec: 11.71, Grad Norm: 0.0060 +[2025-02-21 13:42:45] (step=0649300) Train Loss: 0.1467, Train Steps/Sec: 9.67, Grad Norm: 0.0068 +[2025-02-21 13:42:55] (step=0649400) Train Loss: 0.1470, Train Steps/Sec: 10.65, Grad Norm: 0.0061 +[2025-02-21 13:43:03] (step=0649500) Train Loss: 0.1457, Train Steps/Sec: 12.11, Grad Norm: 0.0060 +[2025-02-21 13:43:11] (step=0649600) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 13:43:19] (step=0649700) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 13:43:27] (step=0649800) Train Loss: 0.1461, Train Steps/Sec: 11.74, Grad Norm: 0.0062 +[2025-02-21 13:43:35] (step=0649900) Train Loss: 0.1451, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 13:43:44] (step=0650000) Train Loss: 0.1453, Train Steps/Sec: 11.02, Grad Norm: 0.0056 +[2025-02-21 13:43:53] (step=0650100) Train Loss: 0.1468, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 13:44:01] (step=0650200) Train Loss: 0.1465, Train Steps/Sec: 11.81, Grad Norm: 0.0058 +[2025-02-21 13:44:09] (step=0650300) Train Loss: 0.1458, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 13:44:17] (step=0650400) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 13:44:25] (step=0650500) Train Loss: 0.1461, Train Steps/Sec: 12.53, Grad Norm: 0.0058 +[2025-02-21 13:44:34] (step=0650600) Train Loss: 0.1456, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-21 13:44:42] (step=0650700) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 13:44:50] (step=0650800) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 13:44:58] (step=0650900) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 13:45:06] (step=0651000) Train Loss: 0.1459, Train Steps/Sec: 11.75, Grad Norm: 0.0059 +[2025-02-21 13:45:15] (step=0651100) Train Loss: 0.1466, Train Steps/Sec: 11.33, Grad Norm: 0.0063 +[2025-02-21 13:45:24] (step=0651200) Train Loss: 0.1470, Train Steps/Sec: 11.79, Grad Norm: 0.0066 +[2025-02-21 13:45:32] (step=0651300) Train Loss: 0.1465, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 13:45:41] (step=0651400) Train Loss: 0.1462, Train Steps/Sec: 11.32, Grad Norm: 0.0062 +[2025-02-21 13:45:50] (step=0651500) Train Loss: 0.1450, Train Steps/Sec: 11.01, Grad Norm: 0.0061 +[2025-02-21 13:45:59] (step=0651600) Train Loss: 0.1447, Train Steps/Sec: 10.65, Grad Norm: 0.0064 +[2025-02-21 13:46:08] (step=0651700) Train Loss: 0.1455, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 13:46:16] (step=0651800) Train Loss: 0.1456, Train Steps/Sec: 11.29, Grad Norm: 0.0059 +[2025-02-21 13:46:24] (step=0651900) Train Loss: 0.1476, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 13:46:33] (step=0652000) Train Loss: 0.1464, Train Steps/Sec: 11.74, Grad Norm: 0.0062 +[2025-02-21 13:46:41] (step=0652100) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 13:46:50] (step=0652200) Train Loss: 0.1469, Train Steps/Sec: 10.67, Grad Norm: 0.0058 +[2025-02-21 13:46:58] (step=0652300) Train Loss: 0.1465, Train Steps/Sec: 12.19, Grad Norm: 0.0058 +[2025-02-21 13:47:07] (step=0652400) Train Loss: 0.1443, Train Steps/Sec: 11.78, Grad Norm: 0.0059 +[2025-02-21 13:47:15] (step=0652500) Train Loss: 0.1461, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 13:47:23] (step=0652600) Train Loss: 0.1449, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 13:47:31] (step=0652700) Train Loss: 0.1464, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 13:47:39] (step=0652800) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 13:47:46] (step=0652900) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 13:47:54] (step=0653000) Train Loss: 0.1473, Train Steps/Sec: 12.58, Grad Norm: 0.0057 +[2025-02-21 13:48:03] (step=0653100) Train Loss: 0.1470, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-21 13:48:12] (step=0653200) Train Loss: 0.1478, Train Steps/Sec: 11.70, Grad Norm: 0.0058 +[2025-02-21 13:48:21] (step=0653300) Train Loss: 0.1468, Train Steps/Sec: 11.30, Grad Norm: 0.0059 +[2025-02-21 13:48:30] (step=0653400) Train Loss: 0.1457, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 13:48:38] (step=0653500) Train Loss: 0.1454, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 13:48:46] (step=0653600) Train Loss: 0.1470, Train Steps/Sec: 11.71, Grad Norm: 0.0060 +[2025-02-21 13:48:56] (step=0653700) Train Loss: 0.1470, Train Steps/Sec: 10.72, Grad Norm: 0.0061 +[2025-02-21 13:49:05] (step=0653800) Train Loss: 0.1464, Train Steps/Sec: 10.99, Grad Norm: 0.0061 +[2025-02-21 13:49:13] (step=0653900) Train Loss: 0.1461, Train Steps/Sec: 12.14, Grad Norm: 0.0057 +[2025-02-21 13:49:21] (step=0654000) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 13:49:29] (step=0654100) Train Loss: 0.1451, Train Steps/Sec: 12.20, Grad Norm: 0.0061 +[2025-02-21 13:49:37] (step=0654200) Train Loss: 0.1459, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 13:49:47] (step=0654300) Train Loss: 0.1456, Train Steps/Sec: 9.96, Grad Norm: 0.0061 +[2025-02-21 13:49:56] (step=0654400) Train Loss: 0.1469, Train Steps/Sec: 11.72, Grad Norm: 0.0059 +[2025-02-21 13:50:04] (step=0654500) Train Loss: 0.1446, Train Steps/Sec: 11.79, Grad Norm: 0.0060 +[2025-02-21 13:50:12] (step=0654600) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 13:50:20] (step=0654700) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 13:50:28] (step=0654800) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 13:50:36] (step=0654900) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 13:50:44] (step=0655000) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 13:50:52] (step=0655100) Train Loss: 0.1479, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 13:51:00] (step=0655200) Train Loss: 0.1472, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 13:51:08] (step=0655300) Train Loss: 0.1454, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 13:51:16] (step=0655400) Train Loss: 0.1464, Train Steps/Sec: 11.74, Grad Norm: 0.0060 +[2025-02-21 13:51:25] (step=0655500) Train Loss: 0.1465, Train Steps/Sec: 11.28, Grad Norm: 0.0060 +[2025-02-21 13:51:35] (step=0655600) Train Loss: 0.1454, Train Steps/Sec: 10.27, Grad Norm: 0.0061 +[2025-02-21 13:51:43] (step=0655700) Train Loss: 0.1465, Train Steps/Sec: 12.12, Grad Norm: 0.0060 +[2025-02-21 13:51:52] (step=0655800) Train Loss: 0.1454, Train Steps/Sec: 11.74, Grad Norm: 0.0060 +[2025-02-21 13:52:01] (step=0655900) Train Loss: 0.1453, Train Steps/Sec: 10.68, Grad Norm: 0.0059 +[2025-02-21 13:52:10] (step=0656000) Train Loss: 0.1463, Train Steps/Sec: 10.92, Grad Norm: 0.0064 +[2025-02-21 13:52:19] (step=0656100) Train Loss: 0.1467, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-21 13:52:27] (step=0656200) Train Loss: 0.1466, Train Steps/Sec: 12.20, Grad Norm: 0.0057 +[2025-02-21 13:52:35] (step=0656300) Train Loss: 0.1460, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 13:52:44] (step=0656400) Train Loss: 0.1458, Train Steps/Sec: 11.73, Grad Norm: 0.0058 +[2025-02-21 13:52:52] (step=0656500) Train Loss: 0.1453, Train Steps/Sec: 11.81, Grad Norm: 0.0059 +[2025-02-21 13:53:01] (step=0656600) Train Loss: 0.1458, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 13:53:09] (step=0656700) Train Loss: 0.1453, Train Steps/Sec: 12.11, Grad Norm: 0.0061 +[2025-02-21 13:53:18] (step=0656800) Train Loss: 0.1451, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-21 13:53:26] (step=0656900) Train Loss: 0.1460, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 13:53:34] (step=0657000) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 13:53:42] (step=0657100) Train Loss: 0.1446, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 13:53:50] (step=0657200) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 13:53:58] (step=0657300) Train Loss: 0.1479, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 13:54:06] (step=0657400) Train Loss: 0.1465, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 13:54:14] (step=0657500) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 13:54:22] (step=0657600) Train Loss: 0.1463, Train Steps/Sec: 11.73, Grad Norm: 0.0060 +[2025-02-21 13:54:31] (step=0657700) Train Loss: 0.1471, Train Steps/Sec: 11.35, Grad Norm: 0.0062 +[2025-02-21 13:54:40] (step=0657800) Train Loss: 0.1455, Train Steps/Sec: 11.74, Grad Norm: 0.0056 +[2025-02-21 13:54:48] (step=0657900) Train Loss: 0.1463, Train Steps/Sec: 11.69, Grad Norm: 0.0059 +[2025-02-21 13:54:57] (step=0658000) Train Loss: 0.1452, Train Steps/Sec: 11.71, Grad Norm: 0.0058 +[2025-02-21 13:55:07] (step=0658100) Train Loss: 0.1452, Train Steps/Sec: 9.57, Grad Norm: 0.0062 +[2025-02-21 13:55:17] (step=0658200) Train Loss: 0.1470, Train Steps/Sec: 10.50, Grad Norm: 0.0059 +[2025-02-21 13:55:25] (step=0658300) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 13:55:33] (step=0658400) Train Loss: 0.1449, Train Steps/Sec: 12.11, Grad Norm: 0.0058 +[2025-02-21 13:55:42] (step=0658500) Train Loss: 0.1460, Train Steps/Sec: 11.31, Grad Norm: 0.0062 +[2025-02-21 13:55:50] (step=0658600) Train Loss: 0.1460, Train Steps/Sec: 11.83, Grad Norm: 0.0061 +[2025-02-21 13:55:59] (step=0658700) Train Loss: 0.1461, Train Steps/Sec: 11.82, Grad Norm: 0.0058 +[2025-02-21 13:56:07] (step=0658800) Train Loss: 0.1454, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 13:56:15] (step=0658900) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 13:56:23] (step=0659000) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 13:56:31] (step=0659100) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 13:56:39] (step=0659200) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 13:56:48] (step=0659300) Train Loss: 0.1450, Train Steps/Sec: 11.22, Grad Norm: 0.0058 +[2025-02-21 13:56:56] (step=0659400) Train Loss: 0.1450, Train Steps/Sec: 12.54, Grad Norm: 0.0061 +[2025-02-21 13:57:04] (step=0659500) Train Loss: 0.1442, Train Steps/Sec: 12.60, Grad Norm: 0.0056 +[2025-02-21 13:57:12] (step=0659600) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 13:57:20] (step=0659700) Train Loss: 0.1447, Train Steps/Sec: 12.13, Grad Norm: 0.0063 +[2025-02-21 13:57:29] (step=0659800) Train Loss: 0.1459, Train Steps/Sec: 11.73, Grad Norm: 0.0057 +[2025-02-21 13:57:37] (step=0659900) Train Loss: 0.1472, Train Steps/Sec: 11.75, Grad Norm: 0.0059 +[2025-02-21 13:57:46] (step=0660000) Train Loss: 0.1464, Train Steps/Sec: 11.37, Grad Norm: 0.0060 +[2025-02-21 13:57:49] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0660000.pt +[2025-02-21 13:57:57] (step=0660100) Train Loss: 0.1450, Train Steps/Sec: 8.81, Grad Norm: 0.0065 +[2025-02-21 13:58:06] (step=0660200) Train Loss: 0.1459, Train Steps/Sec: 11.05, Grad Norm: 0.0063 +[2025-02-21 13:58:15] (step=0660300) Train Loss: 0.1461, Train Steps/Sec: 11.05, Grad Norm: 0.0061 +[2025-02-21 13:58:25] (step=0660400) Train Loss: 0.1470, Train Steps/Sec: 10.59, Grad Norm: 0.0062 +[2025-02-21 13:58:33] (step=0660500) Train Loss: 0.1459, Train Steps/Sec: 12.15, Grad Norm: 0.0057 +[2025-02-21 13:58:42] (step=0660600) Train Loss: 0.1445, Train Steps/Sec: 11.28, Grad Norm: 0.0059 +[2025-02-21 13:58:51] (step=0660700) Train Loss: 0.1464, Train Steps/Sec: 10.62, Grad Norm: 0.0058 +[2025-02-21 13:58:59] (step=0660800) Train Loss: 0.1475, Train Steps/Sec: 12.18, Grad Norm: 0.0056 +[2025-02-21 13:59:08] (step=0660900) Train Loss: 0.1466, Train Steps/Sec: 11.71, Grad Norm: 0.0060 +[2025-02-21 13:59:16] (step=0661000) Train Loss: 0.1457, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 13:59:24] (step=0661100) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 13:59:32] (step=0661200) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 13:59:40] (step=0661300) Train Loss: 0.1459, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 13:59:48] (step=0661400) Train Loss: 0.1452, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 13:59:56] (step=0661500) Train Loss: 0.1474, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 14:00:04] (step=0661600) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 14:00:12] (step=0661700) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 14:00:21] (step=0661800) Train Loss: 0.1459, Train Steps/Sec: 11.26, Grad Norm: 0.0059 +[2025-02-21 14:00:29] (step=0661900) Train Loss: 0.1445, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 14:00:38] (step=0662000) Train Loss: 0.1472, Train Steps/Sec: 11.30, Grad Norm: 0.0059 +[2025-02-21 14:00:46] (step=0662100) Train Loss: 0.1472, Train Steps/Sec: 11.71, Grad Norm: 0.0056 +[2025-02-21 14:00:55] (step=0662200) Train Loss: 0.1454, Train Steps/Sec: 11.72, Grad Norm: 0.0057 +[2025-02-21 14:01:03] (step=0662300) Train Loss: 0.1461, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 14:01:13] (step=0662400) Train Loss: 0.1454, Train Steps/Sec: 10.42, Grad Norm: 0.0056 +[2025-02-21 14:01:22] (step=0662500) Train Loss: 0.1466, Train Steps/Sec: 10.65, Grad Norm: 0.0057 +[2025-02-21 14:01:31] (step=0662600) Train Loss: 0.1462, Train Steps/Sec: 11.74, Grad Norm: 0.0062 +[2025-02-21 14:01:39] (step=0662700) Train Loss: 0.1465, Train Steps/Sec: 12.20, Grad Norm: 0.0059 +[2025-02-21 14:01:47] (step=0662800) Train Loss: 0.1450, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 14:01:56] (step=0662900) Train Loss: 0.1466, Train Steps/Sec: 11.40, Grad Norm: 0.0066 +[2025-02-21 14:02:04] (step=0663000) Train Loss: 0.1458, Train Steps/Sec: 11.76, Grad Norm: 0.0059 +[2025-02-21 14:02:14] (step=0663100) Train Loss: 0.1470, Train Steps/Sec: 10.54, Grad Norm: 0.0059 +[2025-02-21 14:02:22] (step=0663200) Train Loss: 0.1449, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 14:02:30] (step=0663300) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 14:02:38] (step=0663400) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 14:02:46] (step=0663500) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 14:02:54] (step=0663600) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 14:03:01] (step=0663700) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 14:03:09] (step=0663800) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 14:03:17] (step=0663900) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 14:03:25] (step=0664000) Train Loss: 0.1453, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 14:03:34] (step=0664100) Train Loss: 0.1463, Train Steps/Sec: 12.11, Grad Norm: 0.0061 +[2025-02-21 14:03:42] (step=0664200) Train Loss: 0.1454, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 14:03:52] (step=0664300) Train Loss: 0.1454, Train Steps/Sec: 10.54, Grad Norm: 0.0059 +[2025-02-21 14:04:00] (step=0664400) Train Loss: 0.1450, Train Steps/Sec: 11.70, Grad Norm: 0.0059 +[2025-02-21 14:04:09] (step=0664500) Train Loss: 0.1451, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 14:04:18] (step=0664600) Train Loss: 0.1465, Train Steps/Sec: 10.37, Grad Norm: 0.0060 +[2025-02-21 14:04:28] (step=0664700) Train Loss: 0.1466, Train Steps/Sec: 10.64, Grad Norm: 0.0059 +[2025-02-21 14:04:37] (step=0664800) Train Loss: 0.1456, Train Steps/Sec: 11.36, Grad Norm: 0.0059 +[2025-02-21 14:04:44] (step=0664900) Train Loss: 0.1471, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 14:04:53] (step=0665000) Train Loss: 0.1463, Train Steps/Sec: 11.42, Grad Norm: 0.0057 +[2025-02-21 14:05:02] (step=0665100) Train Loss: 0.1460, Train Steps/Sec: 11.79, Grad Norm: 0.0059 +[2025-02-21 14:05:10] (step=0665200) Train Loss: 0.1463, Train Steps/Sec: 11.79, Grad Norm: 0.0063 +[2025-02-21 14:05:18] (step=0665300) Train Loss: 0.1452, Train Steps/Sec: 12.17, Grad Norm: 0.0059 +[2025-02-21 14:05:26] (step=0665400) Train Loss: 0.1459, Train Steps/Sec: 12.65, Grad Norm: 0.0058 +[2025-02-21 14:05:34] (step=0665500) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 14:05:43] (step=0665600) Train Loss: 0.1448, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-21 14:05:51] (step=0665700) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 14:05:59] (step=0665800) Train Loss: 0.1445, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 14:06:07] (step=0665900) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 14:06:15] (step=0666000) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 14:06:23] (step=0666100) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 14:06:31] (step=0666200) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 14:06:39] (step=0666300) Train Loss: 0.1452, Train Steps/Sec: 11.75, Grad Norm: 0.0060 +[2025-02-21 14:06:48] (step=0666400) Train Loss: 0.1454, Train Steps/Sec: 11.75, Grad Norm: 0.0058 +[2025-02-21 14:06:56] (step=0666500) Train Loss: 0.1455, Train Steps/Sec: 11.73, Grad Norm: 0.0058 +[2025-02-21 14:07:05] (step=0666600) Train Loss: 0.1452, Train Steps/Sec: 11.71, Grad Norm: 0.0062 +[2025-02-21 14:07:14] (step=0666700) Train Loss: 0.1456, Train Steps/Sec: 11.35, Grad Norm: 0.0062 +[2025-02-21 14:07:24] (step=0666800) Train Loss: 0.1466, Train Steps/Sec: 9.74, Grad Norm: 0.0058 +[2025-02-21 14:07:33] (step=0666900) Train Loss: 0.1456, Train Steps/Sec: 10.61, Grad Norm: 0.0057 +[2025-02-21 14:07:42] (step=0667000) Train Loss: 0.1457, Train Steps/Sec: 11.74, Grad Norm: 0.0061 +[2025-02-21 14:07:51] (step=0667100) Train Loss: 0.1472, Train Steps/Sec: 11.41, Grad Norm: 0.0058 +[2025-02-21 14:07:58] (step=0667200) Train Loss: 0.1462, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 14:08:07] (step=0667300) Train Loss: 0.1472, Train Steps/Sec: 11.41, Grad Norm: 0.0057 +[2025-02-21 14:08:15] (step=0667400) Train Loss: 0.1466, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 14:08:24] (step=0667500) Train Loss: 0.1453, Train Steps/Sec: 12.15, Grad Norm: 0.0058 +[2025-02-21 14:08:32] (step=0667600) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 14:08:40] (step=0667700) Train Loss: 0.1458, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 14:08:47] (step=0667800) Train Loss: 0.1456, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 14:08:55] (step=0667900) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 14:09:03] (step=0668000) Train Loss: 0.1462, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 14:09:12] (step=0668100) Train Loss: 0.1451, Train Steps/Sec: 11.28, Grad Norm: 0.0060 +[2025-02-21 14:09:20] (step=0668200) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 14:09:28] (step=0668300) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 14:09:36] (step=0668400) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 14:09:44] (step=0668500) Train Loss: 0.1452, Train Steps/Sec: 11.75, Grad Norm: 0.0063 +[2025-02-21 14:09:53] (step=0668600) Train Loss: 0.1452, Train Steps/Sec: 11.32, Grad Norm: 0.0057 +[2025-02-21 14:10:01] (step=0668700) Train Loss: 0.1470, Train Steps/Sec: 12.20, Grad Norm: 0.0061 +[2025-02-21 14:10:10] (step=0668800) Train Loss: 0.1469, Train Steps/Sec: 11.35, Grad Norm: 0.0056 +[2025-02-21 14:10:19] (step=0668900) Train Loss: 0.1470, Train Steps/Sec: 11.82, Grad Norm: 0.0060 +[2025-02-21 14:10:28] (step=0669000) Train Loss: 0.1467, Train Steps/Sec: 10.75, Grad Norm: 0.0059 +[2025-02-21 14:10:37] (step=0669100) Train Loss: 0.1452, Train Steps/Sec: 10.64, Grad Norm: 0.0062 +[2025-02-21 14:10:46] (step=0669200) Train Loss: 0.1460, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 14:10:56] (step=0669300) Train Loss: 0.1457, Train Steps/Sec: 10.61, Grad Norm: 0.0058 +[2025-02-21 14:11:04] (step=0669400) Train Loss: 0.1446, Train Steps/Sec: 11.79, Grad Norm: 0.0060 +[2025-02-21 14:11:13] (step=0669500) Train Loss: 0.1470, Train Steps/Sec: 11.72, Grad Norm: 0.0059 +[2025-02-21 14:11:21] (step=0669600) Train Loss: 0.1464, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 14:11:29] (step=0669700) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 14:11:37] (step=0669800) Train Loss: 0.1479, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 14:11:45] (step=0669900) Train Loss: 0.1449, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 14:11:53] (step=0670000) Train Loss: 0.1463, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 14:12:01] (step=0670100) Train Loss: 0.1453, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 14:12:09] (step=0670200) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 14:12:16] (step=0670300) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 14:12:24] (step=0670400) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 14:12:32] (step=0670500) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 14:12:41] (step=0670600) Train Loss: 0.1463, Train Steps/Sec: 11.19, Grad Norm: 0.0059 +[2025-02-21 14:12:50] (step=0670700) Train Loss: 0.1459, Train Steps/Sec: 11.68, Grad Norm: 0.0060 +[2025-02-21 14:12:59] (step=0670800) Train Loss: 0.1460, Train Steps/Sec: 11.31, Grad Norm: 0.0061 +[2025-02-21 14:13:07] (step=0670900) Train Loss: 0.1450, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 14:13:16] (step=0671000) Train Loss: 0.1455, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 14:13:24] (step=0671100) Train Loss: 0.1455, Train Steps/Sec: 11.39, Grad Norm: 0.0059 +[2025-02-21 14:13:34] (step=0671200) Train Loss: 0.1451, Train Steps/Sec: 10.75, Grad Norm: 0.0059 +[2025-02-21 14:13:43] (step=0671300) Train Loss: 0.1454, Train Steps/Sec: 10.62, Grad Norm: 0.0056 +[2025-02-21 14:13:52] (step=0671400) Train Loss: 0.1477, Train Steps/Sec: 11.06, Grad Norm: 0.0058 +[2025-02-21 14:14:01] (step=0671500) Train Loss: 0.1457, Train Steps/Sec: 11.80, Grad Norm: 0.0061 +[2025-02-21 14:14:09] (step=0671600) Train Loss: 0.1458, Train Steps/Sec: 11.74, Grad Norm: 0.0060 +[2025-02-21 14:14:17] (step=0671700) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 14:14:26] (step=0671800) Train Loss: 0.1458, Train Steps/Sec: 10.89, Grad Norm: 0.0064 +[2025-02-21 14:14:34] (step=0671900) Train Loss: 0.1463, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 14:14:42] (step=0672000) Train Loss: 0.1471, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 14:14:50] (step=0672100) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 14:14:58] (step=0672200) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 14:15:06] (step=0672300) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 14:15:14] (step=0672400) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 14:15:22] (step=0672500) Train Loss: 0.1464, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 14:15:30] (step=0672600) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 14:15:38] (step=0672700) Train Loss: 0.1456, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 14:15:46] (step=0672800) Train Loss: 0.1461, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 14:15:54] (step=0672900) Train Loss: 0.1464, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 14:16:03] (step=0673000) Train Loss: 0.1462, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 14:16:12] (step=0673100) Train Loss: 0.1470, Train Steps/Sec: 10.56, Grad Norm: 0.0061 +[2025-02-21 14:16:21] (step=0673200) Train Loss: 0.1463, Train Steps/Sec: 11.68, Grad Norm: 0.0060 +[2025-02-21 14:16:30] (step=0673300) Train Loss: 0.1467, Train Steps/Sec: 11.03, Grad Norm: 0.0060 +[2025-02-21 14:16:40] (step=0673400) Train Loss: 0.1461, Train Steps/Sec: 10.36, Grad Norm: 0.0059 +[2025-02-21 14:16:49] (step=0673500) Train Loss: 0.1472, Train Steps/Sec: 10.35, Grad Norm: 0.0058 +[2025-02-21 14:16:58] (step=0673600) Train Loss: 0.1458, Train Steps/Sec: 11.75, Grad Norm: 0.0063 +[2025-02-21 14:17:07] (step=0673700) Train Loss: 0.1462, Train Steps/Sec: 11.44, Grad Norm: 0.0060 +[2025-02-21 14:17:15] (step=0673800) Train Loss: 0.1463, Train Steps/Sec: 12.15, Grad Norm: 0.0058 +[2025-02-21 14:17:23] (step=0673900) Train Loss: 0.1452, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 14:17:31] (step=0674000) Train Loss: 0.1446, Train Steps/Sec: 12.15, Grad Norm: 0.0057 +[2025-02-21 14:17:39] (step=0674100) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 14:17:47] (step=0674200) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 14:17:56] (step=0674300) Train Loss: 0.1458, Train Steps/Sec: 11.25, Grad Norm: 0.0067 +[2025-02-21 14:18:04] (step=0674400) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 14:18:12] (step=0674500) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 14:18:20] (step=0674600) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 14:18:28] (step=0674700) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 14:18:36] (step=0674800) Train Loss: 0.1453, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 14:18:43] (step=0674900) Train Loss: 0.1446, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 14:18:51] (step=0675000) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 14:19:00] (step=0675100) Train Loss: 0.1443, Train Steps/Sec: 11.75, Grad Norm: 0.0061 +[2025-02-21 14:19:09] (step=0675200) Train Loss: 0.1467, Train Steps/Sec: 11.35, Grad Norm: 0.0059 +[2025-02-21 14:19:17] (step=0675300) Train Loss: 0.1457, Train Steps/Sec: 11.74, Grad Norm: 0.0064 +[2025-02-21 14:19:26] (step=0675400) Train Loss: 0.1453, Train Steps/Sec: 11.75, Grad Norm: 0.0058 +[2025-02-21 14:19:35] (step=0675500) Train Loss: 0.1478, Train Steps/Sec: 10.72, Grad Norm: 0.0059 +[2025-02-21 14:19:46] (step=0675600) Train Loss: 0.1458, Train Steps/Sec: 9.15, Grad Norm: 0.0065 +[2025-02-21 14:19:55] (step=0675700) Train Loss: 0.1452, Train Steps/Sec: 10.65, Grad Norm: 0.0059 +[2025-02-21 14:20:04] (step=0675800) Train Loss: 0.1458, Train Steps/Sec: 11.35, Grad Norm: 0.0062 +[2025-02-21 14:20:12] (step=0675900) Train Loss: 0.1455, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 14:20:21] (step=0676000) Train Loss: 0.1461, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 14:20:29] (step=0676100) Train Loss: 0.1469, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 14:20:37] (step=0676200) Train Loss: 0.1454, Train Steps/Sec: 12.18, Grad Norm: 0.0064 +[2025-02-21 14:20:45] (step=0676300) Train Loss: 0.1452, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 14:20:53] (step=0676400) Train Loss: 0.1448, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 14:21:01] (step=0676500) Train Loss: 0.1455, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 14:21:08] (step=0676600) Train Loss: 0.1453, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 14:21:16] (step=0676700) Train Loss: 0.1470, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 14:21:25] (step=0676800) Train Loss: 0.1464, Train Steps/Sec: 11.25, Grad Norm: 0.0059 +[2025-02-21 14:21:33] (step=0676900) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 14:21:41] (step=0677000) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 14:21:49] (step=0677100) Train Loss: 0.1457, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 14:21:57] (step=0677200) Train Loss: 0.1454, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 14:22:05] (step=0677300) Train Loss: 0.1469, Train Steps/Sec: 11.74, Grad Norm: 0.0060 +[2025-02-21 14:22:14] (step=0677400) Train Loss: 0.1472, Train Steps/Sec: 11.34, Grad Norm: 0.0061 +[2025-02-21 14:22:23] (step=0677500) Train Loss: 0.1460, Train Steps/Sec: 11.75, Grad Norm: 0.0058 +[2025-02-21 14:22:32] (step=0677600) Train Loss: 0.1453, Train Steps/Sec: 11.01, Grad Norm: 0.0058 +[2025-02-21 14:22:41] (step=0677700) Train Loss: 0.1452, Train Steps/Sec: 11.08, Grad Norm: 0.0061 +[2025-02-21 14:22:51] (step=0677800) Train Loss: 0.1455, Train Steps/Sec: 9.47, Grad Norm: 0.0059 +[2025-02-21 14:23:01] (step=0677900) Train Loss: 0.1460, Train Steps/Sec: 11.00, Grad Norm: 0.0063 +[2025-02-21 14:23:09] (step=0678000) Train Loss: 0.1465, Train Steps/Sec: 11.77, Grad Norm: 0.0062 +[2025-02-21 14:23:18] (step=0678100) Train Loss: 0.1460, Train Steps/Sec: 11.22, Grad Norm: 0.0061 +[2025-02-21 14:23:26] (step=0678200) Train Loss: 0.1460, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 14:23:34] (step=0678300) Train Loss: 0.1472, Train Steps/Sec: 12.18, Grad Norm: 0.0058 +[2025-02-21 14:23:42] (step=0678400) Train Loss: 0.1448, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 14:23:50] (step=0678500) Train Loss: 0.1470, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 14:23:58] (step=0678600) Train Loss: 0.1448, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 14:24:06] (step=0678700) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 14:24:14] (step=0678800) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 14:24:22] (step=0678900) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 14:24:30] (step=0679000) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 14:24:38] (step=0679100) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 14:24:46] (step=0679200) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 14:24:55] (step=0679300) Train Loss: 0.1461, Train Steps/Sec: 11.30, Grad Norm: 0.0058 +[2025-02-21 14:25:03] (step=0679400) Train Loss: 0.1462, Train Steps/Sec: 12.12, Grad Norm: 0.0063 +[2025-02-21 14:25:11] (step=0679500) Train Loss: 0.1468, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 14:25:20] (step=0679600) Train Loss: 0.1471, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 14:25:29] (step=0679700) Train Loss: 0.1455, Train Steps/Sec: 11.41, Grad Norm: 0.0059 +[2025-02-21 14:25:38] (step=0679800) Train Loss: 0.1469, Train Steps/Sec: 11.00, Grad Norm: 0.0062 +[2025-02-21 14:25:48] (step=0679900) Train Loss: 0.1461, Train Steps/Sec: 10.08, Grad Norm: 0.0059 +[2025-02-21 14:25:57] (step=0680000) Train Loss: 0.1461, Train Steps/Sec: 10.35, Grad Norm: 0.0057 +[2025-02-21 14:26:01] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0680000.pt +[2025-02-21 14:26:10] (step=0680100) Train Loss: 0.1461, Train Steps/Sec: 7.99, Grad Norm: 0.0060 +[2025-02-21 14:26:18] (step=0680200) Train Loss: 0.1457, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 14:26:26] (step=0680300) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 14:26:34] (step=0680400) Train Loss: 0.1457, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 14:26:42] (step=0680500) Train Loss: 0.1470, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 14:26:51] (step=0680600) Train Loss: 0.1455, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-21 14:26:59] (step=0680700) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 14:27:07] (step=0680800) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 14:27:15] (step=0680900) Train Loss: 0.1451, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 14:27:23] (step=0681000) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 14:27:31] (step=0681100) Train Loss: 0.1455, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 14:27:39] (step=0681200) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 14:27:47] (step=0681300) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 14:27:55] (step=0681400) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 14:28:03] (step=0681500) Train Loss: 0.1449, Train Steps/Sec: 12.60, Grad Norm: 0.0056 +[2025-02-21 14:28:11] (step=0681600) Train Loss: 0.1447, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 14:28:19] (step=0681700) Train Loss: 0.1482, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 14:28:29] (step=0681800) Train Loss: 0.1462, Train Steps/Sec: 10.22, Grad Norm: 0.0058 +[2025-02-21 14:28:38] (step=0681900) Train Loss: 0.1460, Train Steps/Sec: 11.37, Grad Norm: 0.0058 +[2025-02-21 14:28:47] (step=0682000) Train Loss: 0.1465, Train Steps/Sec: 10.25, Grad Norm: 0.0060 +[2025-02-21 14:28:57] (step=0682100) Train Loss: 0.1461, Train Steps/Sec: 10.39, Grad Norm: 0.0060 +[2025-02-21 14:29:07] (step=0682200) Train Loss: 0.1449, Train Steps/Sec: 10.04, Grad Norm: 0.0060 +[2025-02-21 14:29:16] (step=0682300) Train Loss: 0.1452, Train Steps/Sec: 11.33, Grad Norm: 0.0061 +[2025-02-21 14:29:24] (step=0682400) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 14:29:32] (step=0682500) Train Loss: 0.1460, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 14:29:40] (step=0682600) Train Loss: 0.1451, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 14:29:48] (step=0682700) Train Loss: 0.1456, Train Steps/Sec: 12.20, Grad Norm: 0.0062 +[2025-02-21 14:29:56] (step=0682800) Train Loss: 0.1454, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 14:30:04] (step=0682900) Train Loss: 0.1470, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 14:30:12] (step=0683000) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 14:30:21] (step=0683100) Train Loss: 0.1460, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-21 14:30:29] (step=0683200) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 14:30:37] (step=0683300) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 14:30:44] (step=0683400) Train Loss: 0.1464, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 14:30:52] (step=0683500) Train Loss: 0.1476, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 14:31:00] (step=0683600) Train Loss: 0.1457, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 14:31:08] (step=0683700) Train Loss: 0.1471, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 14:31:16] (step=0683800) Train Loss: 0.1458, Train Steps/Sec: 12.14, Grad Norm: 0.0063 +[2025-02-21 14:31:25] (step=0683900) Train Loss: 0.1455, Train Steps/Sec: 12.19, Grad Norm: 0.0062 +[2025-02-21 14:31:34] (step=0684000) Train Loss: 0.1467, Train Steps/Sec: 10.98, Grad Norm: 0.0058 +[2025-02-21 14:31:43] (step=0684100) Train Loss: 0.1461, Train Steps/Sec: 11.34, Grad Norm: 0.0059 +[2025-02-21 14:31:53] (step=0684200) Train Loss: 0.1449, Train Steps/Sec: 9.77, Grad Norm: 0.0059 +[2025-02-21 14:32:03] (step=0684300) Train Loss: 0.1457, Train Steps/Sec: 9.94, Grad Norm: 0.0057 +[2025-02-21 14:32:13] (step=0684400) Train Loss: 0.1456, Train Steps/Sec: 9.97, Grad Norm: 0.0059 +[2025-02-21 14:32:21] (step=0684500) Train Loss: 0.1457, Train Steps/Sec: 11.73, Grad Norm: 0.0060 +[2025-02-21 14:32:29] (step=0684600) Train Loss: 0.1452, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 14:32:37] (step=0684700) Train Loss: 0.1456, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 14:32:46] (step=0684800) Train Loss: 0.1455, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 14:32:54] (step=0684900) Train Loss: 0.1458, Train Steps/Sec: 12.18, Grad Norm: 0.0063 +[2025-02-21 14:33:02] (step=0685000) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 14:33:10] (step=0685100) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 14:33:17] (step=0685200) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 14:33:25] (step=0685300) Train Loss: 0.1453, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 14:33:33] (step=0685400) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 14:33:41] (step=0685500) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 14:33:50] (step=0685600) Train Loss: 0.1460, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-21 14:33:58] (step=0685700) Train Loss: 0.1456, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 14:34:06] (step=0685800) Train Loss: 0.1449, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 14:34:14] (step=0685900) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 14:34:22] (step=0686000) Train Loss: 0.1458, Train Steps/Sec: 12.14, Grad Norm: 0.0063 +[2025-02-21 14:34:31] (step=0686100) Train Loss: 0.1459, Train Steps/Sec: 11.42, Grad Norm: 0.0060 +[2025-02-21 14:34:40] (step=0686200) Train Loss: 0.1465, Train Steps/Sec: 11.72, Grad Norm: 0.0063 +[2025-02-21 14:34:49] (step=0686300) Train Loss: 0.1457, Train Steps/Sec: 10.62, Grad Norm: 0.0058 +[2025-02-21 14:34:59] (step=0686400) Train Loss: 0.1468, Train Steps/Sec: 10.38, Grad Norm: 0.0059 +[2025-02-21 14:35:09] (step=0686500) Train Loss: 0.1460, Train Steps/Sec: 10.02, Grad Norm: 0.0063 +[2025-02-21 14:35:18] (step=0686600) Train Loss: 0.1467, Train Steps/Sec: 10.59, Grad Norm: 0.0062 +[2025-02-21 14:35:26] (step=0686700) Train Loss: 0.1456, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 14:35:35] (step=0686800) Train Loss: 0.1459, Train Steps/Sec: 11.28, Grad Norm: 0.0057 +[2025-02-21 14:35:43] (step=0686900) Train Loss: 0.1446, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 14:35:52] (step=0687000) Train Loss: 0.1458, Train Steps/Sec: 11.72, Grad Norm: 0.0062 +[2025-02-21 14:35:59] (step=0687100) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 14:36:07] (step=0687200) Train Loss: 0.1454, Train Steps/Sec: 12.58, Grad Norm: 0.0057 +[2025-02-21 14:36:15] (step=0687300) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 14:36:23] (step=0687400) Train Loss: 0.1441, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 14:36:31] (step=0687500) Train Loss: 0.1470, Train Steps/Sec: 12.63, Grad Norm: 0.0057 +[2025-02-21 14:36:39] (step=0687600) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 14:36:47] (step=0687700) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 14:36:55] (step=0687800) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 14:37:03] (step=0687900) Train Loss: 0.1458, Train Steps/Sec: 12.54, Grad Norm: 0.0058 +[2025-02-21 14:37:11] (step=0688000) Train Loss: 0.1465, Train Steps/Sec: 12.54, Grad Norm: 0.0058 +[2025-02-21 14:37:20] (step=0688100) Train Loss: 0.1463, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-21 14:37:28] (step=0688200) Train Loss: 0.1450, Train Steps/Sec: 11.73, Grad Norm: 0.0062 +[2025-02-21 14:37:37] (step=0688300) Train Loss: 0.1448, Train Steps/Sec: 11.74, Grad Norm: 0.0062 +[2025-02-21 14:37:45] (step=0688400) Train Loss: 0.1459, Train Steps/Sec: 11.68, Grad Norm: 0.0059 +[2025-02-21 14:37:55] (step=0688500) Train Loss: 0.1467, Train Steps/Sec: 10.05, Grad Norm: 0.0059 +[2025-02-21 14:38:05] (step=0688600) Train Loss: 0.1452, Train Steps/Sec: 10.31, Grad Norm: 0.0057 +[2025-02-21 14:38:15] (step=0688700) Train Loss: 0.1463, Train Steps/Sec: 10.03, Grad Norm: 0.0059 +[2025-02-21 14:38:24] (step=0688800) Train Loss: 0.1475, Train Steps/Sec: 10.95, Grad Norm: 0.0057 +[2025-02-21 14:38:32] (step=0688900) Train Loss: 0.1468, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 14:38:40] (step=0689000) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 14:38:48] (step=0689100) Train Loss: 0.1438, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 14:38:57] (step=0689200) Train Loss: 0.1444, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 14:39:04] (step=0689300) Train Loss: 0.1458, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 14:39:13] (step=0689400) Train Loss: 0.1462, Train Steps/Sec: 11.20, Grad Norm: 0.0061 +[2025-02-21 14:39:21] (step=0689500) Train Loss: 0.1462, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 14:39:29] (step=0689600) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 14:39:37] (step=0689700) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 14:39:45] (step=0689800) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 14:39:53] (step=0689900) Train Loss: 0.1449, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 14:40:01] (step=0690000) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 14:40:09] (step=0690100) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 14:40:17] (step=0690200) Train Loss: 0.1466, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 14:40:25] (step=0690300) Train Loss: 0.1464, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 14:40:33] (step=0690400) Train Loss: 0.1470, Train Steps/Sec: 11.72, Grad Norm: 0.0061 +[2025-02-21 14:40:42] (step=0690500) Train Loss: 0.1456, Train Steps/Sec: 11.77, Grad Norm: 0.0058 +[2025-02-21 14:40:53] (step=0690600) Train Loss: 0.1457, Train Steps/Sec: 9.32, Grad Norm: 0.0060 +[2025-02-21 14:41:02] (step=0690700) Train Loss: 0.1451, Train Steps/Sec: 10.94, Grad Norm: 0.0064 +[2025-02-21 14:41:12] (step=0690800) Train Loss: 0.1457, Train Steps/Sec: 9.46, Grad Norm: 0.0062 +[2025-02-21 14:41:22] (step=0690900) Train Loss: 0.1459, Train Steps/Sec: 10.93, Grad Norm: 0.0061 +[2025-02-21 14:41:30] (step=0691000) Train Loss: 0.1467, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 14:41:38] (step=0691100) Train Loss: 0.1469, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 14:41:46] (step=0691200) Train Loss: 0.1450, Train Steps/Sec: 12.65, Grad Norm: 0.0059 +[2025-02-21 14:41:54] (step=0691300) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 14:42:03] (step=0691400) Train Loss: 0.1448, Train Steps/Sec: 11.76, Grad Norm: 0.0060 +[2025-02-21 14:42:11] (step=0691500) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 14:42:18] (step=0691600) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 14:42:26] (step=0691700) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 14:42:34] (step=0691800) Train Loss: 0.1454, Train Steps/Sec: 12.56, Grad Norm: 0.0058 +[2025-02-21 14:42:43] (step=0691900) Train Loss: 0.1463, Train Steps/Sec: 11.19, Grad Norm: 0.0061 +[2025-02-21 14:42:51] (step=0692000) Train Loss: 0.1464, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 14:42:59] (step=0692100) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 14:43:07] (step=0692200) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 14:43:15] (step=0692300) Train Loss: 0.1451, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 14:43:23] (step=0692400) Train Loss: 0.1449, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 14:43:31] (step=0692500) Train Loss: 0.1463, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 14:43:40] (step=0692600) Train Loss: 0.1472, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 14:43:48] (step=0692700) Train Loss: 0.1468, Train Steps/Sec: 11.65, Grad Norm: 0.0057 +[2025-02-21 14:43:58] (step=0692800) Train Loss: 0.1454, Train Steps/Sec: 10.55, Grad Norm: 0.0060 +[2025-02-21 14:44:08] (step=0692900) Train Loss: 0.1460, Train Steps/Sec: 9.94, Grad Norm: 0.0059 +[2025-02-21 14:44:18] (step=0693000) Train Loss: 0.1457, Train Steps/Sec: 10.30, Grad Norm: 0.0061 +[2025-02-21 14:44:28] (step=0693100) Train Loss: 0.1454, Train Steps/Sec: 9.48, Grad Norm: 0.0057 +[2025-02-21 14:44:37] (step=0693200) Train Loss: 0.1456, Train Steps/Sec: 11.71, Grad Norm: 0.0058 +[2025-02-21 14:44:45] (step=0693300) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 14:44:53] (step=0693400) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 14:45:01] (step=0693500) Train Loss: 0.1455, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 14:45:09] (step=0693600) Train Loss: 0.1470, Train Steps/Sec: 12.17, Grad Norm: 0.0063 +[2025-02-21 14:45:17] (step=0693700) Train Loss: 0.1453, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 14:45:25] (step=0693800) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 14:45:33] (step=0693900) Train Loss: 0.1466, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 14:45:41] (step=0694000) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 14:45:49] (step=0694100) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 14:45:57] (step=0694200) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 14:46:05] (step=0694300) Train Loss: 0.1462, Train Steps/Sec: 12.56, Grad Norm: 0.0057 +[2025-02-21 14:46:13] (step=0694400) Train Loss: 0.1463, Train Steps/Sec: 11.21, Grad Norm: 0.0057 +[2025-02-21 14:46:21] (step=0694500) Train Loss: 0.1447, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 14:46:29] (step=0694600) Train Loss: 0.1452, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 14:46:38] (step=0694700) Train Loss: 0.1462, Train Steps/Sec: 12.20, Grad Norm: 0.0059 +[2025-02-21 14:46:46] (step=0694800) Train Loss: 0.1463, Train Steps/Sec: 11.75, Grad Norm: 0.0060 +[2025-02-21 14:46:56] (step=0694900) Train Loss: 0.1469, Train Steps/Sec: 10.05, Grad Norm: 0.0057 +[2025-02-21 14:47:04] (step=0695000) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 14:47:14] (step=0695100) Train Loss: 0.1453, Train Steps/Sec: 9.75, Grad Norm: 0.0057 +[2025-02-21 14:47:24] (step=0695200) Train Loss: 0.1466, Train Steps/Sec: 10.34, Grad Norm: 0.0060 +[2025-02-21 14:47:33] (step=0695300) Train Loss: 0.1467, Train Steps/Sec: 10.91, Grad Norm: 0.0066 +[2025-02-21 14:47:42] (step=0695400) Train Loss: 0.1474, Train Steps/Sec: 11.72, Grad Norm: 0.0063 +[2025-02-21 14:47:50] (step=0695500) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 14:47:59] (step=0695600) Train Loss: 0.1481, Train Steps/Sec: 11.13, Grad Norm: 0.0058 +[2025-02-21 14:48:07] (step=0695700) Train Loss: 0.1469, Train Steps/Sec: 12.08, Grad Norm: 0.0060 +[2025-02-21 14:48:15] (step=0695800) Train Loss: 0.1463, Train Steps/Sec: 12.12, Grad Norm: 0.0057 +[2025-02-21 14:48:23] (step=0695900) Train Loss: 0.1452, Train Steps/Sec: 12.57, Grad Norm: 0.0057 +[2025-02-21 14:48:31] (step=0696000) Train Loss: 0.1455, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 14:48:39] (step=0696100) Train Loss: 0.1457, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 14:48:47] (step=0696200) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 14:48:55] (step=0696300) Train Loss: 0.1446, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 14:49:03] (step=0696400) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 14:49:11] (step=0696500) Train Loss: 0.1450, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 14:49:19] (step=0696600) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 14:49:26] (step=0696700) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 14:49:35] (step=0696800) Train Loss: 0.1471, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 14:49:44] (step=0696900) Train Loss: 0.1458, Train Steps/Sec: 11.22, Grad Norm: 0.0058 +[2025-02-21 14:49:53] (step=0697000) Train Loss: 0.1452, Train Steps/Sec: 10.66, Grad Norm: 0.0058 +[2025-02-21 14:50:02] (step=0697100) Train Loss: 0.1449, Train Steps/Sec: 10.93, Grad Norm: 0.0058 +[2025-02-21 14:50:12] (step=0697200) Train Loss: 0.1459, Train Steps/Sec: 10.68, Grad Norm: 0.0061 +[2025-02-21 14:50:21] (step=0697300) Train Loss: 0.1458, Train Steps/Sec: 10.60, Grad Norm: 0.0059 +[2025-02-21 14:50:30] (step=0697400) Train Loss: 0.1463, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-21 14:50:39] (step=0697500) Train Loss: 0.1462, Train Steps/Sec: 10.93, Grad Norm: 0.0059 +[2025-02-21 14:50:48] (step=0697600) Train Loss: 0.1452, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 14:50:56] (step=0697700) Train Loss: 0.1455, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 14:51:04] (step=0697800) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 14:51:12] (step=0697900) Train Loss: 0.1441, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 14:51:20] (step=0698000) Train Loss: 0.1463, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 14:51:29] (step=0698100) Train Loss: 0.1463, Train Steps/Sec: 11.15, Grad Norm: 0.0060 +[2025-02-21 14:51:37] (step=0698200) Train Loss: 0.1467, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 14:51:45] (step=0698300) Train Loss: 0.1449, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 14:51:53] (step=0698400) Train Loss: 0.1476, Train Steps/Sec: 12.60, Grad Norm: 0.0056 +[2025-02-21 14:52:01] (step=0698500) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 14:52:09] (step=0698600) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 14:52:17] (step=0698700) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 14:52:25] (step=0698800) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 14:52:33] (step=0698900) Train Loss: 0.1448, Train Steps/Sec: 12.18, Grad Norm: 0.0057 +[2025-02-21 14:52:41] (step=0699000) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 14:52:49] (step=0699100) Train Loss: 0.1462, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 14:52:58] (step=0699200) Train Loss: 0.1471, Train Steps/Sec: 10.65, Grad Norm: 0.0062 +[2025-02-21 14:53:08] (step=0699300) Train Loss: 0.1465, Train Steps/Sec: 10.62, Grad Norm: 0.0058 +[2025-02-21 14:53:18] (step=0699400) Train Loss: 0.1454, Train Steps/Sec: 10.25, Grad Norm: 0.0058 +[2025-02-21 14:53:27] (step=0699500) Train Loss: 0.1466, Train Steps/Sec: 10.60, Grad Norm: 0.0060 +[2025-02-21 14:53:36] (step=0699600) Train Loss: 0.1451, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-21 14:53:46] (step=0699700) Train Loss: 0.1457, Train Steps/Sec: 10.57, Grad Norm: 0.0064 +[2025-02-21 14:53:54] (step=0699800) Train Loss: 0.1455, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 14:54:02] (step=0699900) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 14:54:10] (step=0700000) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 14:54:14] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0700000.pt +[2025-02-21 14:54:23] (step=0700100) Train Loss: 0.1462, Train Steps/Sec: 7.85, Grad Norm: 0.0057 +[2025-02-21 14:54:30] (step=0700200) Train Loss: 0.1450, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 14:54:38] (step=0700300) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 14:54:46] (step=0700400) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 14:54:54] (step=0700500) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 14:55:03] (step=0700600) Train Loss: 0.1455, Train Steps/Sec: 11.21, Grad Norm: 0.0057 +[2025-02-21 14:55:11] (step=0700700) Train Loss: 0.1438, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 14:55:19] (step=0700800) Train Loss: 0.1462, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 14:55:27] (step=0700900) Train Loss: 0.1457, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 14:55:35] (step=0701000) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 14:55:43] (step=0701100) Train Loss: 0.1465, Train Steps/Sec: 12.18, Grad Norm: 0.0057 +[2025-02-21 14:55:51] (step=0701200) Train Loss: 0.1453, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 14:56:00] (step=0701300) Train Loss: 0.1474, Train Steps/Sec: 10.98, Grad Norm: 0.0059 +[2025-02-21 14:56:09] (step=0701400) Train Loss: 0.1451, Train Steps/Sec: 11.37, Grad Norm: 0.0062 +[2025-02-21 14:56:18] (step=0701500) Train Loss: 0.1456, Train Steps/Sec: 10.64, Grad Norm: 0.0060 +[2025-02-21 14:56:27] (step=0701600) Train Loss: 0.1466, Train Steps/Sec: 11.75, Grad Norm: 0.0061 +[2025-02-21 14:56:36] (step=0701700) Train Loss: 0.1463, Train Steps/Sec: 10.64, Grad Norm: 0.0062 +[2025-02-21 14:56:46] (step=0701800) Train Loss: 0.1446, Train Steps/Sec: 10.62, Grad Norm: 0.0063 +[2025-02-21 14:56:56] (step=0701900) Train Loss: 0.1461, Train Steps/Sec: 9.84, Grad Norm: 0.0059 +[2025-02-21 14:57:04] (step=0702000) Train Loss: 0.1463, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 14:57:12] (step=0702100) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 14:57:20] (step=0702200) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 14:57:29] (step=0702300) Train Loss: 0.1465, Train Steps/Sec: 11.72, Grad Norm: 0.0059 +[2025-02-21 14:57:36] (step=0702400) Train Loss: 0.1475, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 14:57:44] (step=0702500) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 14:57:52] (step=0702600) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 14:58:00] (step=0702700) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 14:58:08] (step=0702800) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 14:58:16] (step=0702900) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 14:58:24] (step=0703000) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 14:58:33] (step=0703100) Train Loss: 0.1452, Train Steps/Sec: 11.25, Grad Norm: 0.0056 +[2025-02-21 14:58:41] (step=0703200) Train Loss: 0.1451, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 14:58:49] (step=0703300) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 14:58:58] (step=0703400) Train Loss: 0.1459, Train Steps/Sec: 11.34, Grad Norm: 0.0058 +[2025-02-21 14:59:06] (step=0703500) Train Loss: 0.1439, Train Steps/Sec: 12.14, Grad Norm: 0.0057 +[2025-02-21 14:59:16] (step=0703600) Train Loss: 0.1474, Train Steps/Sec: 10.67, Grad Norm: 0.0060 +[2025-02-21 14:59:24] (step=0703700) Train Loss: 0.1466, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 14:59:33] (step=0703800) Train Loss: 0.1465, Train Steps/Sec: 11.75, Grad Norm: 0.0060 +[2025-02-21 14:59:43] (step=0703900) Train Loss: 0.1470, Train Steps/Sec: 10.01, Grad Norm: 0.0060 +[2025-02-21 14:59:52] (step=0704000) Train Loss: 0.1460, Train Steps/Sec: 11.32, Grad Norm: 0.0062 +[2025-02-21 15:00:01] (step=0704100) Train Loss: 0.1465, Train Steps/Sec: 10.54, Grad Norm: 0.0059 +[2025-02-21 15:00:09] (step=0704200) Train Loss: 0.1457, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 15:00:17] (step=0704300) Train Loss: 0.1462, Train Steps/Sec: 12.43, Grad Norm: 0.0057 +[2025-02-21 15:00:26] (step=0704400) Train Loss: 0.1462, Train Steps/Sec: 11.18, Grad Norm: 0.0059 +[2025-02-21 15:00:35] (step=0704500) Train Loss: 0.1450, Train Steps/Sec: 11.56, Grad Norm: 0.0062 +[2025-02-21 15:00:43] (step=0704600) Train Loss: 0.1461, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 15:00:51] (step=0704700) Train Loss: 0.1475, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 15:00:59] (step=0704800) Train Loss: 0.1462, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 15:01:07] (step=0704900) Train Loss: 0.1456, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 15:01:15] (step=0705000) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 15:01:22] (step=0705100) Train Loss: 0.1448, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 15:01:30] (step=0705200) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 15:01:39] (step=0705300) Train Loss: 0.1458, Train Steps/Sec: 12.12, Grad Norm: 0.0062 +[2025-02-21 15:01:47] (step=0705400) Train Loss: 0.1470, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 15:01:55] (step=0705500) Train Loss: 0.1457, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 15:02:05] (step=0705600) Train Loss: 0.1482, Train Steps/Sec: 10.18, Grad Norm: 0.0060 +[2025-02-21 15:02:14] (step=0705700) Train Loss: 0.1467, Train Steps/Sec: 11.34, Grad Norm: 0.0057 +[2025-02-21 15:02:22] (step=0705800) Train Loss: 0.1447, Train Steps/Sec: 11.74, Grad Norm: 0.0060 +[2025-02-21 15:02:31] (step=0705900) Train Loss: 0.1461, Train Steps/Sec: 11.30, Grad Norm: 0.0060 +[2025-02-21 15:02:40] (step=0706000) Train Loss: 0.1453, Train Steps/Sec: 11.36, Grad Norm: 0.0059 +[2025-02-21 15:02:49] (step=0706100) Train Loss: 0.1458, Train Steps/Sec: 10.30, Grad Norm: 0.0059 +[2025-02-21 15:02:59] (step=0706200) Train Loss: 0.1453, Train Steps/Sec: 10.62, Grad Norm: 0.0058 +[2025-02-21 15:03:08] (step=0706300) Train Loss: 0.1470, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 15:03:16] (step=0706400) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 15:03:24] (step=0706500) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 15:03:31] (step=0706600) Train Loss: 0.1448, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 15:03:40] (step=0706700) Train Loss: 0.1457, Train Steps/Sec: 11.71, Grad Norm: 0.0057 +[2025-02-21 15:03:48] (step=0706800) Train Loss: 0.1459, Train Steps/Sec: 12.55, Grad Norm: 0.0060 +[2025-02-21 15:03:57] (step=0706900) Train Loss: 0.1450, Train Steps/Sec: 11.28, Grad Norm: 0.0056 +[2025-02-21 15:04:05] (step=0707000) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 15:04:13] (step=0707100) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 15:04:21] (step=0707200) Train Loss: 0.1446, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 15:04:29] (step=0707300) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 15:04:36] (step=0707400) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 15:04:45] (step=0707500) Train Loss: 0.1452, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 15:04:53] (step=0707600) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 15:05:01] (step=0707700) Train Loss: 0.1464, Train Steps/Sec: 11.37, Grad Norm: 0.0060 +[2025-02-21 15:05:10] (step=0707800) Train Loss: 0.1462, Train Steps/Sec: 11.77, Grad Norm: 0.0061 +[2025-02-21 15:05:18] (step=0707900) Train Loss: 0.1460, Train Steps/Sec: 11.74, Grad Norm: 0.0062 +[2025-02-21 15:05:27] (step=0708000) Train Loss: 0.1461, Train Steps/Sec: 11.71, Grad Norm: 0.0061 +[2025-02-21 15:05:37] (step=0708100) Train Loss: 0.1462, Train Steps/Sec: 10.22, Grad Norm: 0.0061 +[2025-02-21 15:05:46] (step=0708200) Train Loss: 0.1471, Train Steps/Sec: 10.91, Grad Norm: 0.0059 +[2025-02-21 15:05:55] (step=0708300) Train Loss: 0.1459, Train Steps/Sec: 10.63, Grad Norm: 0.0057 +[2025-02-21 15:06:04] (step=0708400) Train Loss: 0.1460, Train Steps/Sec: 11.26, Grad Norm: 0.0060 +[2025-02-21 15:06:13] (step=0708500) Train Loss: 0.1459, Train Steps/Sec: 11.33, Grad Norm: 0.0061 +[2025-02-21 15:06:21] (step=0708600) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 15:06:29] (step=0708700) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 15:06:37] (step=0708800) Train Loss: 0.1465, Train Steps/Sec: 11.71, Grad Norm: 0.0061 +[2025-02-21 15:06:45] (step=0708900) Train Loss: 0.1462, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 15:06:53] (step=0709000) Train Loss: 0.1440, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 15:07:01] (step=0709100) Train Loss: 0.1467, Train Steps/Sec: 12.55, Grad Norm: 0.0060 +[2025-02-21 15:07:09] (step=0709200) Train Loss: 0.1460, Train Steps/Sec: 12.58, Grad Norm: 0.0069 +[2025-02-21 15:07:17] (step=0709300) Train Loss: 0.1464, Train Steps/Sec: 12.53, Grad Norm: 0.0062 +[2025-02-21 15:07:26] (step=0709400) Train Loss: 0.1460, Train Steps/Sec: 11.31, Grad Norm: 0.0057 +[2025-02-21 15:07:34] (step=0709500) Train Loss: 0.1453, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 15:07:42] (step=0709600) Train Loss: 0.1444, Train Steps/Sec: 12.20, Grad Norm: 0.0061 +[2025-02-21 15:07:50] (step=0709700) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 15:07:59] (step=0709800) Train Loss: 0.1455, Train Steps/Sec: 11.35, Grad Norm: 0.0063 +[2025-02-21 15:08:07] (step=0709900) Train Loss: 0.1465, Train Steps/Sec: 12.20, Grad Norm: 0.0056 +[2025-02-21 15:08:16] (step=0710000) Train Loss: 0.1474, Train Steps/Sec: 11.39, Grad Norm: 0.0057 +[2025-02-21 15:08:24] (step=0710100) Train Loss: 0.1465, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 15:08:32] (step=0710200) Train Loss: 0.1460, Train Steps/Sec: 12.19, Grad Norm: 0.0058 +[2025-02-21 15:08:41] (step=0710300) Train Loss: 0.1459, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 15:08:50] (step=0710400) Train Loss: 0.1448, Train Steps/Sec: 10.98, Grad Norm: 0.0058 +[2025-02-21 15:09:00] (step=0710500) Train Loss: 0.1461, Train Steps/Sec: 10.32, Grad Norm: 0.0060 +[2025-02-21 15:09:10] (step=0710600) Train Loss: 0.1456, Train Steps/Sec: 10.18, Grad Norm: 0.0058 +[2025-02-21 15:09:19] (step=0710700) Train Loss: 0.1457, Train Steps/Sec: 11.32, Grad Norm: 0.0058 +[2025-02-21 15:09:27] (step=0710800) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 15:09:35] (step=0710900) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 15:09:43] (step=0711000) Train Loss: 0.1457, Train Steps/Sec: 11.72, Grad Norm: 0.0063 +[2025-02-21 15:09:51] (step=0711100) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 15:09:59] (step=0711200) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 15:10:07] (step=0711300) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 15:10:15] (step=0711400) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 15:10:23] (step=0711500) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 15:10:31] (step=0711600) Train Loss: 0.1455, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 15:10:39] (step=0711700) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 15:10:47] (step=0711800) Train Loss: 0.1465, Train Steps/Sec: 12.08, Grad Norm: 0.0061 +[2025-02-21 15:10:56] (step=0711900) Train Loss: 0.1466, Train Steps/Sec: 10.92, Grad Norm: 0.0061 +[2025-02-21 15:11:05] (step=0712000) Train Loss: 0.1441, Train Steps/Sec: 11.37, Grad Norm: 0.0059 +[2025-02-21 15:11:13] (step=0712100) Train Loss: 0.1449, Train Steps/Sec: 11.81, Grad Norm: 0.0057 +[2025-02-21 15:11:22] (step=0712200) Train Loss: 0.1457, Train Steps/Sec: 12.11, Grad Norm: 0.0060 +[2025-02-21 15:11:30] (step=0712300) Train Loss: 0.1458, Train Steps/Sec: 12.12, Grad Norm: 0.0064 +[2025-02-21 15:11:38] (step=0712400) Train Loss: 0.1446, Train Steps/Sec: 11.76, Grad Norm: 0.0060 +[2025-02-21 15:11:47] (step=0712500) Train Loss: 0.1459, Train Steps/Sec: 11.69, Grad Norm: 0.0059 +[2025-02-21 15:11:56] (step=0712600) Train Loss: 0.1478, Train Steps/Sec: 10.63, Grad Norm: 0.0060 +[2025-02-21 15:12:06] (step=0712700) Train Loss: 0.1459, Train Steps/Sec: 10.33, Grad Norm: 0.0063 +[2025-02-21 15:12:15] (step=0712800) Train Loss: 0.1462, Train Steps/Sec: 11.70, Grad Norm: 0.0059 +[2025-02-21 15:12:23] (step=0712900) Train Loss: 0.1454, Train Steps/Sec: 11.31, Grad Norm: 0.0059 +[2025-02-21 15:12:31] (step=0713000) Train Loss: 0.1450, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 15:12:40] (step=0713100) Train Loss: 0.1456, Train Steps/Sec: 11.27, Grad Norm: 0.0063 +[2025-02-21 15:12:49] (step=0713200) Train Loss: 0.1460, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 15:12:57] (step=0713300) Train Loss: 0.1473, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 15:13:05] (step=0713400) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 15:13:13] (step=0713500) Train Loss: 0.1466, Train Steps/Sec: 12.63, Grad Norm: 0.0057 +[2025-02-21 15:13:21] (step=0713600) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 15:13:28] (step=0713700) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 15:13:36] (step=0713800) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 15:13:45] (step=0713900) Train Loss: 0.1461, Train Steps/Sec: 12.18, Grad Norm: 0.0058 +[2025-02-21 15:13:53] (step=0714000) Train Loss: 0.1456, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 15:14:01] (step=0714100) Train Loss: 0.1464, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 15:14:10] (step=0714200) Train Loss: 0.1471, Train Steps/Sec: 11.70, Grad Norm: 0.0063 +[2025-02-21 15:14:19] (step=0714300) Train Loss: 0.1466, Train Steps/Sec: 11.68, Grad Norm: 0.0062 +[2025-02-21 15:14:27] (step=0714400) Train Loss: 0.1454, Train Steps/Sec: 11.29, Grad Norm: 0.0070 +[2025-02-21 15:14:36] (step=0714500) Train Loss: 0.1453, Train Steps/Sec: 12.10, Grad Norm: 0.0060 +[2025-02-21 15:14:44] (step=0714600) Train Loss: 0.1452, Train Steps/Sec: 11.77, Grad Norm: 0.0058 +[2025-02-21 15:14:53] (step=0714700) Train Loss: 0.1463, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 15:15:02] (step=0714800) Train Loss: 0.1459, Train Steps/Sec: 10.62, Grad Norm: 0.0060 +[2025-02-21 15:15:12] (step=0714900) Train Loss: 0.1458, Train Steps/Sec: 10.62, Grad Norm: 0.0060 +[2025-02-21 15:15:21] (step=0715000) Train Loss: 0.1455, Train Steps/Sec: 10.96, Grad Norm: 0.0062 +[2025-02-21 15:15:30] (step=0715100) Train Loss: 0.1455, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-21 15:15:37] (step=0715200) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 15:15:45] (step=0715300) Train Loss: 0.1466, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 15:15:54] (step=0715400) Train Loss: 0.1456, Train Steps/Sec: 11.69, Grad Norm: 0.0058 +[2025-02-21 15:16:02] (step=0715500) Train Loss: 0.1465, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 15:16:11] (step=0715600) Train Loss: 0.1464, Train Steps/Sec: 11.13, Grad Norm: 0.0060 +[2025-02-21 15:16:19] (step=0715700) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 15:16:27] (step=0715800) Train Loss: 0.1450, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 15:16:35] (step=0715900) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 15:16:43] (step=0716000) Train Loss: 0.1456, Train Steps/Sec: 12.20, Grad Norm: 0.0059 +[2025-02-21 15:16:51] (step=0716100) Train Loss: 0.1448, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 15:16:59] (step=0716200) Train Loss: 0.1449, Train Steps/Sec: 11.76, Grad Norm: 0.0061 +[2025-02-21 15:17:08] (step=0716300) Train Loss: 0.1475, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 15:17:17] (step=0716400) Train Loss: 0.1467, Train Steps/Sec: 11.37, Grad Norm: 0.0059 +[2025-02-21 15:17:25] (step=0716500) Train Loss: 0.1444, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 15:17:33] (step=0716600) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 15:17:41] (step=0716700) Train Loss: 0.1460, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 15:17:49] (step=0716800) Train Loss: 0.1458, Train Steps/Sec: 11.70, Grad Norm: 0.0059 +[2025-02-21 15:17:59] (step=0716900) Train Loss: 0.1461, Train Steps/Sec: 10.25, Grad Norm: 0.0057 +[2025-02-21 15:18:09] (step=0717000) Train Loss: 0.1471, Train Steps/Sec: 10.58, Grad Norm: 0.0062 +[2025-02-21 15:18:19] (step=0717100) Train Loss: 0.1463, Train Steps/Sec: 9.95, Grad Norm: 0.0064 +[2025-02-21 15:18:28] (step=0717200) Train Loss: 0.1463, Train Steps/Sec: 10.96, Grad Norm: 0.0058 +[2025-02-21 15:18:36] (step=0717300) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 15:18:44] (step=0717400) Train Loss: 0.1451, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 15:18:52] (step=0717500) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 15:19:00] (step=0717600) Train Loss: 0.1453, Train Steps/Sec: 11.74, Grad Norm: 0.0062 +[2025-02-21 15:19:08] (step=0717700) Train Loss: 0.1446, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 15:19:16] (step=0717800) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 15:19:24] (step=0717900) Train Loss: 0.1465, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 15:19:32] (step=0718000) Train Loss: 0.1462, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 15:19:41] (step=0718100) Train Loss: 0.1464, Train Steps/Sec: 10.88, Grad Norm: 0.0058 +[2025-02-21 15:19:49] (step=0718200) Train Loss: 0.1457, Train Steps/Sec: 12.56, Grad Norm: 0.0065 +[2025-02-21 15:19:57] (step=0718300) Train Loss: 0.1448, Train Steps/Sec: 12.11, Grad Norm: 0.0058 +[2025-02-21 15:20:06] (step=0718400) Train Loss: 0.1459, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 15:20:15] (step=0718500) Train Loss: 0.1465, Train Steps/Sec: 11.73, Grad Norm: 0.0060 +[2025-02-21 15:20:23] (step=0718600) Train Loss: 0.1449, Train Steps/Sec: 12.10, Grad Norm: 0.0063 +[2025-02-21 15:20:31] (step=0718700) Train Loss: 0.1455, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 15:20:39] (step=0718800) Train Loss: 0.1467, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 15:20:47] (step=0718900) Train Loss: 0.1457, Train Steps/Sec: 12.11, Grad Norm: 0.0059 +[2025-02-21 15:20:56] (step=0719000) Train Loss: 0.1456, Train Steps/Sec: 11.77, Grad Norm: 0.0059 +[2025-02-21 15:21:04] (step=0719100) Train Loss: 0.1456, Train Steps/Sec: 11.29, Grad Norm: 0.0057 +[2025-02-21 15:21:14] (step=0719200) Train Loss: 0.1448, Train Steps/Sec: 10.58, Grad Norm: 0.0057 +[2025-02-21 15:21:24] (step=0719300) Train Loss: 0.1466, Train Steps/Sec: 9.93, Grad Norm: 0.0063 +[2025-02-21 15:21:34] (step=0719400) Train Loss: 0.1451, Train Steps/Sec: 9.95, Grad Norm: 0.0059 +[2025-02-21 15:21:42] (step=0719500) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 15:21:50] (step=0719600) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 15:21:58] (step=0719700) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 15:22:06] (step=0719800) Train Loss: 0.1467, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 15:22:14] (step=0719900) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 15:22:22] (step=0720000) Train Loss: 0.1451, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 15:22:26] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0720000.pt +[2025-02-21 15:22:34] (step=0720100) Train Loss: 0.1455, Train Steps/Sec: 8.41, Grad Norm: 0.0059 +[2025-02-21 15:22:42] (step=0720200) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 15:22:50] (step=0720300) Train Loss: 0.1465, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 15:22:58] (step=0720400) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 15:23:07] (step=0720500) Train Loss: 0.1454, Train Steps/Sec: 11.36, Grad Norm: 0.0058 +[2025-02-21 15:23:16] (step=0720600) Train Loss: 0.1456, Train Steps/Sec: 10.58, Grad Norm: 0.0060 +[2025-02-21 15:23:25] (step=0720700) Train Loss: 0.1461, Train Steps/Sec: 11.69, Grad Norm: 0.0060 +[2025-02-21 15:23:33] (step=0720800) Train Loss: 0.1461, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 15:23:41] (step=0720900) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 15:23:49] (step=0721000) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 15:23:57] (step=0721100) Train Loss: 0.1462, Train Steps/Sec: 12.10, Grad Norm: 0.0060 +[2025-02-21 15:24:06] (step=0721200) Train Loss: 0.1460, Train Steps/Sec: 11.33, Grad Norm: 0.0058 +[2025-02-21 15:24:15] (step=0721300) Train Loss: 0.1458, Train Steps/Sec: 11.31, Grad Norm: 0.0060 +[2025-02-21 15:24:25] (step=0721400) Train Loss: 0.1444, Train Steps/Sec: 10.03, Grad Norm: 0.0059 +[2025-02-21 15:24:34] (step=0721500) Train Loss: 0.1456, Train Steps/Sec: 10.61, Grad Norm: 0.0065 +[2025-02-21 15:24:43] (step=0721600) Train Loss: 0.1449, Train Steps/Sec: 11.29, Grad Norm: 0.0064 +[2025-02-21 15:24:51] (step=0721700) Train Loss: 0.1451, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 15:24:59] (step=0721800) Train Loss: 0.1443, Train Steps/Sec: 12.52, Grad Norm: 0.0058 +[2025-02-21 15:25:08] (step=0721900) Train Loss: 0.1450, Train Steps/Sec: 10.90, Grad Norm: 0.0061 +[2025-02-21 15:25:16] (step=0722000) Train Loss: 0.1466, Train Steps/Sec: 12.10, Grad Norm: 0.0062 +[2025-02-21 15:25:24] (step=0722100) Train Loss: 0.1455, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 15:25:32] (step=0722200) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 15:25:40] (step=0722300) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 15:25:48] (step=0722400) Train Loss: 0.1451, Train Steps/Sec: 12.19, Grad Norm: 0.0058 +[2025-02-21 15:25:56] (step=0722500) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 15:26:05] (step=0722600) Train Loss: 0.1458, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 15:26:13] (step=0722700) Train Loss: 0.1453, Train Steps/Sec: 11.35, Grad Norm: 0.0058 +[2025-02-21 15:26:22] (step=0722800) Train Loss: 0.1471, Train Steps/Sec: 11.31, Grad Norm: 0.0060 +[2025-02-21 15:26:30] (step=0722900) Train Loss: 0.1471, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 15:26:38] (step=0723000) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 15:26:47] (step=0723100) Train Loss: 0.1462, Train Steps/Sec: 11.20, Grad Norm: 0.0057 +[2025-02-21 15:26:55] (step=0723200) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 15:27:04] (step=0723300) Train Loss: 0.1455, Train Steps/Sec: 11.73, Grad Norm: 0.0060 +[2025-02-21 15:27:12] (step=0723400) Train Loss: 0.1467, Train Steps/Sec: 11.72, Grad Norm: 0.0064 +[2025-02-21 15:27:21] (step=0723500) Train Loss: 0.1461, Train Steps/Sec: 11.31, Grad Norm: 0.0063 +[2025-02-21 15:27:31] (step=0723600) Train Loss: 0.1454, Train Steps/Sec: 9.63, Grad Norm: 0.0057 +[2025-02-21 15:27:41] (step=0723700) Train Loss: 0.1468, Train Steps/Sec: 10.58, Grad Norm: 0.0058 +[2025-02-21 15:27:49] (step=0723800) Train Loss: 0.1469, Train Steps/Sec: 11.71, Grad Norm: 0.0063 +[2025-02-21 15:27:57] (step=0723900) Train Loss: 0.1458, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 15:28:05] (step=0724000) Train Loss: 0.1464, Train Steps/Sec: 12.56, Grad Norm: 0.0058 +[2025-02-21 15:28:13] (step=0724100) Train Loss: 0.1468, Train Steps/Sec: 12.12, Grad Norm: 0.0058 +[2025-02-21 15:28:22] (step=0724200) Train Loss: 0.1467, Train Steps/Sec: 12.09, Grad Norm: 0.0063 +[2025-02-21 15:28:30] (step=0724300) Train Loss: 0.1459, Train Steps/Sec: 12.47, Grad Norm: 0.0059 +[2025-02-21 15:28:39] (step=0724400) Train Loss: 0.1449, Train Steps/Sec: 11.26, Grad Norm: 0.0061 +[2025-02-21 15:28:47] (step=0724500) Train Loss: 0.1461, Train Steps/Sec: 12.16, Grad Norm: 0.0060 +[2025-02-21 15:28:55] (step=0724600) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 15:29:03] (step=0724700) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 15:29:12] (step=0724800) Train Loss: 0.1465, Train Steps/Sec: 10.99, Grad Norm: 0.0064 +[2025-02-21 15:29:20] (step=0724900) Train Loss: 0.1445, Train Steps/Sec: 12.20, Grad Norm: 0.0059 +[2025-02-21 15:29:29] (step=0725000) Train Loss: 0.1459, Train Steps/Sec: 11.75, Grad Norm: 0.0062 +[2025-02-21 15:29:36] (step=0725100) Train Loss: 0.1448, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 15:29:44] (step=0725200) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0056 +[2025-02-21 15:29:52] (step=0725300) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 15:30:00] (step=0725400) Train Loss: 0.1457, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 15:30:09] (step=0725500) Train Loss: 0.1461, Train Steps/Sec: 11.72, Grad Norm: 0.0062 +[2025-02-21 15:30:18] (step=0725600) Train Loss: 0.1466, Train Steps/Sec: 10.53, Grad Norm: 0.0059 +[2025-02-21 15:30:27] (step=0725700) Train Loss: 0.1469, Train Steps/Sec: 10.93, Grad Norm: 0.0062 +[2025-02-21 15:30:38] (step=0725800) Train Loss: 0.1455, Train Steps/Sec: 9.65, Grad Norm: 0.0060 +[2025-02-21 15:30:47] (step=0725900) Train Loss: 0.1480, Train Steps/Sec: 10.92, Grad Norm: 0.0061 +[2025-02-21 15:30:56] (step=0726000) Train Loss: 0.1469, Train Steps/Sec: 11.71, Grad Norm: 0.0061 +[2025-02-21 15:31:03] (step=0726100) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 15:31:11] (step=0726200) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 15:31:20] (step=0726300) Train Loss: 0.1456, Train Steps/Sec: 12.14, Grad Norm: 0.0057 +[2025-02-21 15:31:28] (step=0726400) Train Loss: 0.1465, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 15:31:36] (step=0726500) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 15:31:44] (step=0726600) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 15:31:52] (step=0726700) Train Loss: 0.1464, Train Steps/Sec: 12.17, Grad Norm: 0.0058 +[2025-02-21 15:32:00] (step=0726800) Train Loss: 0.1453, Train Steps/Sec: 12.51, Grad Norm: 0.0056 +[2025-02-21 15:32:10] (step=0726900) Train Loss: 0.1458, Train Steps/Sec: 10.21, Grad Norm: 0.0059 +[2025-02-21 15:32:18] (step=0727000) Train Loss: 0.1463, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 15:32:27] (step=0727100) Train Loss: 0.1457, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 15:32:35] (step=0727200) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 15:32:43] (step=0727300) Train Loss: 0.1448, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 15:32:51] (step=0727400) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 15:32:59] (step=0727500) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 15:33:07] (step=0727600) Train Loss: 0.1440, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 15:33:15] (step=0727700) Train Loss: 0.1462, Train Steps/Sec: 11.74, Grad Norm: 0.0057 +[2025-02-21 15:33:24] (step=0727800) Train Loss: 0.1450, Train Steps/Sec: 11.70, Grad Norm: 0.0057 +[2025-02-21 15:33:32] (step=0727900) Train Loss: 0.1455, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 15:33:43] (step=0728000) Train Loss: 0.1458, Train Steps/Sec: 9.42, Grad Norm: 0.0060 +[2025-02-21 15:33:53] (step=0728100) Train Loss: 0.1463, Train Steps/Sec: 9.89, Grad Norm: 0.0059 +[2025-02-21 15:34:02] (step=0728200) Train Loss: 0.1460, Train Steps/Sec: 11.71, Grad Norm: 0.0058 +[2025-02-21 15:34:10] (step=0728300) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 15:34:18] (step=0728400) Train Loss: 0.1455, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 15:34:26] (step=0728500) Train Loss: 0.1471, Train Steps/Sec: 11.73, Grad Norm: 0.0063 +[2025-02-21 15:34:34] (step=0728600) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 15:34:42] (step=0728700) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 15:34:50] (step=0728800) Train Loss: 0.1454, Train Steps/Sec: 12.19, Grad Norm: 0.0058 +[2025-02-21 15:34:58] (step=0728900) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 15:35:06] (step=0729000) Train Loss: 0.1469, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 15:35:15] (step=0729100) Train Loss: 0.1450, Train Steps/Sec: 11.37, Grad Norm: 0.0060 +[2025-02-21 15:35:23] (step=0729200) Train Loss: 0.1455, Train Steps/Sec: 12.18, Grad Norm: 0.0063 +[2025-02-21 15:35:32] (step=0729300) Train Loss: 0.1462, Train Steps/Sec: 11.71, Grad Norm: 0.0062 +[2025-02-21 15:35:41] (step=0729400) Train Loss: 0.1457, Train Steps/Sec: 11.25, Grad Norm: 0.0058 +[2025-02-21 15:35:49] (step=0729500) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 15:35:57] (step=0729600) Train Loss: 0.1457, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 15:36:05] (step=0729700) Train Loss: 0.1456, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 15:36:12] (step=0729800) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 15:36:21] (step=0729900) Train Loss: 0.1468, Train Steps/Sec: 11.75, Grad Norm: 0.0061 +[2025-02-21 15:36:30] (step=0730000) Train Loss: 0.1448, Train Steps/Sec: 10.99, Grad Norm: 0.0059 +[2025-02-21 15:36:39] (step=0730100) Train Loss: 0.1454, Train Steps/Sec: 11.74, Grad Norm: 0.0058 +[2025-02-21 15:36:49] (step=0730200) Train Loss: 0.1461, Train Steps/Sec: 10.01, Grad Norm: 0.0060 +[2025-02-21 15:36:58] (step=0730300) Train Loss: 0.1449, Train Steps/Sec: 10.94, Grad Norm: 0.0059 +[2025-02-21 15:37:06] (step=0730400) Train Loss: 0.1473, Train Steps/Sec: 11.72, Grad Norm: 0.0061 +[2025-02-21 15:37:14] (step=0730500) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 15:37:23] (step=0730600) Train Loss: 0.1465, Train Steps/Sec: 10.90, Grad Norm: 0.0060 +[2025-02-21 15:37:32] (step=0730700) Train Loss: 0.1449, Train Steps/Sec: 12.12, Grad Norm: 0.0060 +[2025-02-21 15:37:40] (step=0730800) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 15:37:48] (step=0730900) Train Loss: 0.1459, Train Steps/Sec: 12.19, Grad Norm: 0.0060 +[2025-02-21 15:37:56] (step=0731000) Train Loss: 0.1452, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 15:38:04] (step=0731100) Train Loss: 0.1469, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 15:38:12] (step=0731200) Train Loss: 0.1454, Train Steps/Sec: 11.38, Grad Norm: 0.0062 +[2025-02-21 15:38:21] (step=0731300) Train Loss: 0.1471, Train Steps/Sec: 11.77, Grad Norm: 0.0060 +[2025-02-21 15:38:29] (step=0731400) Train Loss: 0.1456, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 15:38:37] (step=0731500) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 15:38:45] (step=0731600) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 15:38:53] (step=0731700) Train Loss: 0.1476, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 15:39:01] (step=0731800) Train Loss: 0.1469, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 15:39:10] (step=0731900) Train Loss: 0.1457, Train Steps/Sec: 11.26, Grad Norm: 0.0060 +[2025-02-21 15:39:18] (step=0732000) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 15:39:27] (step=0732100) Train Loss: 0.1462, Train Steps/Sec: 11.74, Grad Norm: 0.0058 +[2025-02-21 15:39:36] (step=0732200) Train Loss: 0.1460, Train Steps/Sec: 11.01, Grad Norm: 0.0058 +[2025-02-21 15:39:45] (step=0732300) Train Loss: 0.1467, Train Steps/Sec: 10.59, Grad Norm: 0.0062 +[2025-02-21 15:39:54] (step=0732400) Train Loss: 0.1464, Train Steps/Sec: 10.68, Grad Norm: 0.0057 +[2025-02-21 15:40:04] (step=0732500) Train Loss: 0.1478, Train Steps/Sec: 10.98, Grad Norm: 0.0059 +[2025-02-21 15:40:12] (step=0732600) Train Loss: 0.1463, Train Steps/Sec: 12.20, Grad Norm: 0.0060 +[2025-02-21 15:40:20] (step=0732700) Train Loss: 0.1461, Train Steps/Sec: 12.63, Grad Norm: 0.0065 +[2025-02-21 15:40:28] (step=0732800) Train Loss: 0.1462, Train Steps/Sec: 12.20, Grad Norm: 0.0062 +[2025-02-21 15:40:36] (step=0732900) Train Loss: 0.1453, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 15:40:44] (step=0733000) Train Loss: 0.1453, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 15:40:53] (step=0733100) Train Loss: 0.1447, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-21 15:41:01] (step=0733200) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 15:41:10] (step=0733300) Train Loss: 0.1453, Train Steps/Sec: 11.32, Grad Norm: 0.0063 +[2025-02-21 15:41:18] (step=0733400) Train Loss: 0.1452, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 15:41:27] (step=0733500) Train Loss: 0.1453, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 15:41:35] (step=0733600) Train Loss: 0.1459, Train Steps/Sec: 12.16, Grad Norm: 0.0063 +[2025-02-21 15:41:43] (step=0733700) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 15:41:51] (step=0733800) Train Loss: 0.1451, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 15:41:59] (step=0733900) Train Loss: 0.1464, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 15:42:07] (step=0734000) Train Loss: 0.1456, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 15:42:14] (step=0734100) Train Loss: 0.1464, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 15:42:22] (step=0734200) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 15:42:31] (step=0734300) Train Loss: 0.1464, Train Steps/Sec: 11.71, Grad Norm: 0.0058 +[2025-02-21 15:42:41] (step=0734400) Train Loss: 0.1465, Train Steps/Sec: 9.94, Grad Norm: 0.0059 +[2025-02-21 15:42:51] (step=0734500) Train Loss: 0.1452, Train Steps/Sec: 9.70, Grad Norm: 0.0057 +[2025-02-21 15:43:00] (step=0734600) Train Loss: 0.1458, Train Steps/Sec: 10.95, Grad Norm: 0.0061 +[2025-02-21 15:43:10] (step=0734700) Train Loss: 0.1458, Train Steps/Sec: 10.95, Grad Norm: 0.0061 +[2025-02-21 15:43:17] (step=0734800) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 15:43:25] (step=0734900) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 15:43:34] (step=0735000) Train Loss: 0.1455, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 15:43:42] (step=0735100) Train Loss: 0.1458, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 15:43:50] (step=0735200) Train Loss: 0.1461, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 15:43:58] (step=0735300) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 15:44:06] (step=0735400) Train Loss: 0.1452, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 15:44:15] (step=0735500) Train Loss: 0.1464, Train Steps/Sec: 10.98, Grad Norm: 0.0061 +[2025-02-21 15:44:24] (step=0735600) Train Loss: 0.1461, Train Steps/Sec: 10.92, Grad Norm: 0.0066 +[2025-02-21 15:44:33] (step=0735700) Train Loss: 0.1463, Train Steps/Sec: 11.69, Grad Norm: 0.0061 +[2025-02-21 15:44:41] (step=0735800) Train Loss: 0.1465, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 15:44:49] (step=0735900) Train Loss: 0.1462, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 15:44:57] (step=0736000) Train Loss: 0.1469, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 15:45:05] (step=0736100) Train Loss: 0.1464, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 15:45:13] (step=0736200) Train Loss: 0.1454, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 15:45:20] (step=0736300) Train Loss: 0.1454, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 15:45:29] (step=0736400) Train Loss: 0.1463, Train Steps/Sec: 12.10, Grad Norm: 0.0063 +[2025-02-21 15:45:37] (step=0736500) Train Loss: 0.1451, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 15:45:46] (step=0736600) Train Loss: 0.1461, Train Steps/Sec: 10.57, Grad Norm: 0.0062 +[2025-02-21 15:45:56] (step=0736700) Train Loss: 0.1457, Train Steps/Sec: 9.98, Grad Norm: 0.0059 +[2025-02-21 15:46:06] (step=0736800) Train Loss: 0.1462, Train Steps/Sec: 10.61, Grad Norm: 0.0059 +[2025-02-21 15:46:16] (step=0736900) Train Loss: 0.1454, Train Steps/Sec: 10.20, Grad Norm: 0.0062 +[2025-02-21 15:46:24] (step=0737000) Train Loss: 0.1477, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 15:46:32] (step=0737100) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 15:46:40] (step=0737200) Train Loss: 0.1466, Train Steps/Sec: 12.16, Grad Norm: 0.0062 +[2025-02-21 15:46:48] (step=0737300) Train Loss: 0.1470, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 15:46:56] (step=0737400) Train Loss: 0.1465, Train Steps/Sec: 12.20, Grad Norm: 0.0060 +[2025-02-21 15:47:04] (step=0737500) Train Loss: 0.1465, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 15:47:13] (step=0737600) Train Loss: 0.1465, Train Steps/Sec: 11.39, Grad Norm: 0.0060 +[2025-02-21 15:47:21] (step=0737700) Train Loss: 0.1474, Train Steps/Sec: 11.77, Grad Norm: 0.0061 +[2025-02-21 15:47:30] (step=0737800) Train Loss: 0.1439, Train Steps/Sec: 11.59, Grad Norm: 0.0059 +[2025-02-21 15:47:38] (step=0737900) Train Loss: 0.1456, Train Steps/Sec: 12.55, Grad Norm: 0.0061 +[2025-02-21 15:47:46] (step=0738000) Train Loss: 0.1450, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 15:47:55] (step=0738100) Train Loss: 0.1463, Train Steps/Sec: 11.25, Grad Norm: 0.0058 +[2025-02-21 15:48:03] (step=0738200) Train Loss: 0.1458, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 15:48:11] (step=0738300) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 15:48:19] (step=0738400) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 15:48:27] (step=0738500) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 15:48:35] (step=0738600) Train Loss: 0.1463, Train Steps/Sec: 12.12, Grad Norm: 0.0060 +[2025-02-21 15:48:44] (step=0738700) Train Loss: 0.1469, Train Steps/Sec: 11.36, Grad Norm: 0.0058 +[2025-02-21 15:48:53] (step=0738800) Train Loss: 0.1458, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-21 15:49:03] (step=0738900) Train Loss: 0.1456, Train Steps/Sec: 9.91, Grad Norm: 0.0062 +[2025-02-21 15:49:12] (step=0739000) Train Loss: 0.1457, Train Steps/Sec: 10.57, Grad Norm: 0.0061 +[2025-02-21 15:49:21] (step=0739100) Train Loss: 0.1452, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 15:49:29] (step=0739200) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 15:49:37] (step=0739300) Train Loss: 0.1449, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 15:49:46] (step=0739400) Train Loss: 0.1462, Train Steps/Sec: 10.86, Grad Norm: 0.0060 +[2025-02-21 15:49:55] (step=0739500) Train Loss: 0.1462, Train Steps/Sec: 11.72, Grad Norm: 0.0058 +[2025-02-21 15:50:02] (step=0739600) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 15:50:11] (step=0739700) Train Loss: 0.1469, Train Steps/Sec: 11.34, Grad Norm: 0.0061 +[2025-02-21 15:50:20] (step=0739800) Train Loss: 0.1453, Train Steps/Sec: 11.77, Grad Norm: 0.0060 +[2025-02-21 15:50:28] (step=0739900) Train Loss: 0.1444, Train Steps/Sec: 12.17, Grad Norm: 0.0064 +[2025-02-21 15:50:36] (step=0740000) Train Loss: 0.1451, Train Steps/Sec: 12.17, Grad Norm: 0.0062 +[2025-02-21 15:50:39] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0740000.pt +[2025-02-21 15:50:46] (step=0740100) Train Loss: 0.1461, Train Steps/Sec: 9.76, Grad Norm: 0.0060 +[2025-02-21 15:50:54] (step=0740200) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 15:51:02] (step=0740300) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 15:51:10] (step=0740400) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 15:51:18] (step=0740500) Train Loss: 0.1457, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 15:51:27] (step=0740600) Train Loss: 0.1466, Train Steps/Sec: 11.26, Grad Norm: 0.0058 +[2025-02-21 15:51:35] (step=0740700) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 15:51:43] (step=0740800) Train Loss: 0.1457, Train Steps/Sec: 12.12, Grad Norm: 0.0062 +[2025-02-21 15:51:52] (step=0740900) Train Loss: 0.1454, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 15:52:01] (step=0741000) Train Loss: 0.1451, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-21 15:52:11] (step=0741100) Train Loss: 0.1441, Train Steps/Sec: 10.35, Grad Norm: 0.0060 +[2025-02-21 15:52:21] (step=0741200) Train Loss: 0.1464, Train Steps/Sec: 10.30, Grad Norm: 0.0059 +[2025-02-21 15:52:29] (step=0741300) Train Loss: 0.1455, Train Steps/Sec: 11.72, Grad Norm: 0.0064 +[2025-02-21 15:52:37] (step=0741400) Train Loss: 0.1450, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 15:52:45] (step=0741500) Train Loss: 0.1450, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 15:52:53] (step=0741600) Train Loss: 0.1464, Train Steps/Sec: 12.16, Grad Norm: 0.0058 +[2025-02-21 15:53:02] (step=0741700) Train Loss: 0.1468, Train Steps/Sec: 11.74, Grad Norm: 0.0057 +[2025-02-21 15:53:10] (step=0741800) Train Loss: 0.1442, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 15:53:20] (step=0741900) Train Loss: 0.1453, Train Steps/Sec: 9.97, Grad Norm: 0.0061 +[2025-02-21 15:53:28] (step=0742000) Train Loss: 0.1451, Train Steps/Sec: 11.71, Grad Norm: 0.0057 +[2025-02-21 15:53:36] (step=0742100) Train Loss: 0.1462, Train Steps/Sec: 12.15, Grad Norm: 0.0057 +[2025-02-21 15:53:44] (step=0742200) Train Loss: 0.1470, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 15:53:52] (step=0742300) Train Loss: 0.1458, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 15:54:00] (step=0742400) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 15:54:08] (step=0742500) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 15:54:16] (step=0742600) Train Loss: 0.1453, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 15:54:24] (step=0742700) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0056 +[2025-02-21 15:54:32] (step=0742800) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 15:54:40] (step=0742900) Train Loss: 0.1464, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 15:54:48] (step=0743000) Train Loss: 0.1464, Train Steps/Sec: 12.07, Grad Norm: 0.0058 +[2025-02-21 15:54:58] (step=0743100) Train Loss: 0.1464, Train Steps/Sec: 9.96, Grad Norm: 0.0058 +[2025-02-21 15:55:08] (step=0743200) Train Loss: 0.1452, Train Steps/Sec: 10.86, Grad Norm: 0.0060 +[2025-02-21 15:55:18] (step=0743300) Train Loss: 0.1461, Train Steps/Sec: 10.01, Grad Norm: 0.0058 +[2025-02-21 15:55:27] (step=0743400) Train Loss: 0.1458, Train Steps/Sec: 10.91, Grad Norm: 0.0058 +[2025-02-21 15:55:35] (step=0743500) Train Loss: 0.1447, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 15:55:43] (step=0743600) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 15:55:51] (step=0743700) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 15:56:00] (step=0743800) Train Loss: 0.1460, Train Steps/Sec: 11.76, Grad Norm: 0.0058 +[2025-02-21 15:56:08] (step=0743900) Train Loss: 0.1467, Train Steps/Sec: 12.16, Grad Norm: 0.0058 +[2025-02-21 15:56:17] (step=0744000) Train Loss: 0.1459, Train Steps/Sec: 11.36, Grad Norm: 0.0065 +[2025-02-21 15:56:25] (step=0744100) Train Loss: 0.1462, Train Steps/Sec: 11.74, Grad Norm: 0.0060 +[2025-02-21 15:56:33] (step=0744200) Train Loss: 0.1451, Train Steps/Sec: 12.11, Grad Norm: 0.0061 +[2025-02-21 15:56:42] (step=0744300) Train Loss: 0.1462, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 15:56:51] (step=0744400) Train Loss: 0.1456, Train Steps/Sec: 11.22, Grad Norm: 0.0059 +[2025-02-21 15:56:59] (step=0744500) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 15:57:06] (step=0744600) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 15:57:14] (step=0744700) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 15:57:22] (step=0744800) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 15:57:30] (step=0744900) Train Loss: 0.1453, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 15:57:38] (step=0745000) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 15:57:46] (step=0745100) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 15:57:54] (step=0745200) Train Loss: 0.1465, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 15:58:03] (step=0745300) Train Loss: 0.1462, Train Steps/Sec: 11.02, Grad Norm: 0.0061 +[2025-02-21 15:58:13] (step=0745400) Train Loss: 0.1459, Train Steps/Sec: 10.87, Grad Norm: 0.0058 +[2025-02-21 15:58:23] (step=0745500) Train Loss: 0.1483, Train Steps/Sec: 10.00, Grad Norm: 0.0068 +[2025-02-21 15:58:33] (step=0745600) Train Loss: 0.1452, Train Steps/Sec: 9.63, Grad Norm: 0.0059 +[2025-02-21 15:58:41] (step=0745700) Train Loss: 0.1455, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 15:58:49] (step=0745800) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 15:58:58] (step=0745900) Train Loss: 0.1456, Train Steps/Sec: 11.78, Grad Norm: 0.0059 +[2025-02-21 15:59:06] (step=0746000) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 15:59:14] (step=0746100) Train Loss: 0.1458, Train Steps/Sec: 11.37, Grad Norm: 0.0058 +[2025-02-21 15:59:23] (step=0746200) Train Loss: 0.1456, Train Steps/Sec: 11.39, Grad Norm: 0.0058 +[2025-02-21 15:59:31] (step=0746300) Train Loss: 0.1454, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 15:59:40] (step=0746400) Train Loss: 0.1462, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 15:59:48] (step=0746500) Train Loss: 0.1450, Train Steps/Sec: 12.57, Grad Norm: 0.0057 +[2025-02-21 15:59:56] (step=0746600) Train Loss: 0.1453, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 16:00:04] (step=0746700) Train Loss: 0.1470, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 16:00:11] (step=0746800) Train Loss: 0.1463, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 16:00:20] (step=0746900) Train Loss: 0.1452, Train Steps/Sec: 11.16, Grad Norm: 0.0060 +[2025-02-21 16:00:28] (step=0747000) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 16:00:36] (step=0747100) Train Loss: 0.1470, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 16:00:44] (step=0747200) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 16:00:52] (step=0747300) Train Loss: 0.1462, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 16:01:01] (step=0747400) Train Loss: 0.1463, Train Steps/Sec: 12.08, Grad Norm: 0.0065 +[2025-02-21 16:01:10] (step=0747500) Train Loss: 0.1451, Train Steps/Sec: 10.55, Grad Norm: 0.0062 +[2025-02-21 16:01:19] (step=0747600) Train Loss: 0.1444, Train Steps/Sec: 10.56, Grad Norm: 0.0063 +[2025-02-21 16:01:30] (step=0747700) Train Loss: 0.1466, Train Steps/Sec: 9.69, Grad Norm: 0.0057 +[2025-02-21 16:01:38] (step=0747800) Train Loss: 0.1456, Train Steps/Sec: 11.71, Grad Norm: 0.0065 +[2025-02-21 16:01:47] (step=0747900) Train Loss: 0.1454, Train Steps/Sec: 12.09, Grad Norm: 0.0061 +[2025-02-21 16:01:55] (step=0748000) Train Loss: 0.1458, Train Steps/Sec: 12.55, Grad Norm: 0.0060 +[2025-02-21 16:02:04] (step=0748100) Train Loss: 0.1449, Train Steps/Sec: 10.52, Grad Norm: 0.0062 +[2025-02-21 16:02:13] (step=0748200) Train Loss: 0.1468, Train Steps/Sec: 11.70, Grad Norm: 0.0059 +[2025-02-21 16:02:21] (step=0748300) Train Loss: 0.1450, Train Steps/Sec: 11.33, Grad Norm: 0.0062 +[2025-02-21 16:02:30] (step=0748400) Train Loss: 0.1461, Train Steps/Sec: 11.76, Grad Norm: 0.0058 +[2025-02-21 16:02:38] (step=0748500) Train Loss: 0.1469, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 16:02:46] (step=0748600) Train Loss: 0.1452, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 16:02:54] (step=0748700) Train Loss: 0.1453, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 16:03:02] (step=0748800) Train Loss: 0.1457, Train Steps/Sec: 12.56, Grad Norm: 0.0065 +[2025-02-21 16:03:10] (step=0748900) Train Loss: 0.1462, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 16:03:18] (step=0749000) Train Loss: 0.1457, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 16:03:26] (step=0749100) Train Loss: 0.1462, Train Steps/Sec: 12.56, Grad Norm: 0.0063 +[2025-02-21 16:03:34] (step=0749200) Train Loss: 0.1454, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 16:03:42] (step=0749300) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 16:03:51] (step=0749400) Train Loss: 0.1462, Train Steps/Sec: 11.17, Grad Norm: 0.0059 +[2025-02-21 16:03:59] (step=0749500) Train Loss: 0.1455, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 16:04:07] (step=0749600) Train Loss: 0.1457, Train Steps/Sec: 12.09, Grad Norm: 0.0059 +[2025-02-21 16:04:16] (step=0749700) Train Loss: 0.1457, Train Steps/Sec: 10.61, Grad Norm: 0.0061 +[2025-02-21 16:04:26] (step=0749800) Train Loss: 0.1452, Train Steps/Sec: 10.56, Grad Norm: 0.0060 +[2025-02-21 16:04:36] (step=0749900) Train Loss: 0.1465, Train Steps/Sec: 9.98, Grad Norm: 0.0061 +[2025-02-21 16:04:45] (step=0750000) Train Loss: 0.1450, Train Steps/Sec: 11.30, Grad Norm: 0.0060 +[2025-02-21 16:04:53] (step=0750100) Train Loss: 0.1459, Train Steps/Sec: 12.56, Grad Norm: 0.0063 +[2025-02-21 16:05:01] (step=0750200) Train Loss: 0.1471, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 16:05:09] (step=0750300) Train Loss: 0.1457, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 16:05:18] (step=0750400) Train Loss: 0.1471, Train Steps/Sec: 10.99, Grad Norm: 0.0060 +[2025-02-21 16:05:27] (step=0750500) Train Loss: 0.1471, Train Steps/Sec: 11.72, Grad Norm: 0.0063 +[2025-02-21 16:05:35] (step=0750600) Train Loss: 0.1451, Train Steps/Sec: 11.57, Grad Norm: 0.0060 +[2025-02-21 16:05:44] (step=0750700) Train Loss: 0.1453, Train Steps/Sec: 11.37, Grad Norm: 0.0056 +[2025-02-21 16:05:52] (step=0750800) Train Loss: 0.1464, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 16:06:00] (step=0750900) Train Loss: 0.1473, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 16:06:08] (step=0751000) Train Loss: 0.1456, Train Steps/Sec: 12.57, Grad Norm: 0.0057 +[2025-02-21 16:06:16] (step=0751100) Train Loss: 0.1469, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 16:06:24] (step=0751200) Train Loss: 0.1455, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 16:06:32] (step=0751300) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 16:06:40] (step=0751400) Train Loss: 0.1447, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 16:06:48] (step=0751500) Train Loss: 0.1463, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 16:06:56] (step=0751600) Train Loss: 0.1460, Train Steps/Sec: 12.56, Grad Norm: 0.0057 +[2025-02-21 16:07:04] (step=0751700) Train Loss: 0.1448, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 16:07:12] (step=0751800) Train Loss: 0.1457, Train Steps/Sec: 12.10, Grad Norm: 0.0056 +[2025-02-21 16:07:23] (step=0751900) Train Loss: 0.1463, Train Steps/Sec: 9.53, Grad Norm: 0.0059 +[2025-02-21 16:07:33] (step=0752000) Train Loss: 0.1450, Train Steps/Sec: 9.93, Grad Norm: 0.0059 +[2025-02-21 16:07:42] (step=0752100) Train Loss: 0.1457, Train Steps/Sec: 10.21, Grad Norm: 0.0059 +[2025-02-21 16:07:51] (step=0752200) Train Loss: 0.1459, Train Steps/Sec: 11.71, Grad Norm: 0.0063 +[2025-02-21 16:07:59] (step=0752300) Train Loss: 0.1450, Train Steps/Sec: 12.06, Grad Norm: 0.0060 +[2025-02-21 16:08:07] (step=0752400) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 16:08:16] (step=0752500) Train Loss: 0.1467, Train Steps/Sec: 11.35, Grad Norm: 0.0062 +[2025-02-21 16:08:25] (step=0752600) Train Loss: 0.1472, Train Steps/Sec: 11.05, Grad Norm: 0.0059 +[2025-02-21 16:08:33] (step=0752700) Train Loss: 0.1448, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 16:08:41] (step=0752800) Train Loss: 0.1452, Train Steps/Sec: 12.15, Grad Norm: 0.0057 +[2025-02-21 16:08:49] (step=0752900) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 16:08:57] (step=0753000) Train Loss: 0.1455, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 16:09:05] (step=0753100) Train Loss: 0.1459, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 16:09:14] (step=0753200) Train Loss: 0.1456, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-21 16:09:22] (step=0753300) Train Loss: 0.1467, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 16:09:30] (step=0753400) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 16:09:38] (step=0753500) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 16:09:46] (step=0753600) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 16:09:54] (step=0753700) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 16:10:02] (step=0753800) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 16:10:10] (step=0753900) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 16:10:18] (step=0754000) Train Loss: 0.1459, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 16:10:27] (step=0754100) Train Loss: 0.1466, Train Steps/Sec: 10.91, Grad Norm: 0.0059 +[2025-02-21 16:10:38] (step=0754200) Train Loss: 0.1455, Train Steps/Sec: 9.92, Grad Norm: 0.0061 +[2025-02-21 16:10:48] (step=0754300) Train Loss: 0.1455, Train Steps/Sec: 9.97, Grad Norm: 0.0060 +[2025-02-21 16:10:57] (step=0754400) Train Loss: 0.1456, Train Steps/Sec: 10.18, Grad Norm: 0.0060 +[2025-02-21 16:11:05] (step=0754500) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 16:11:14] (step=0754600) Train Loss: 0.1462, Train Steps/Sec: 11.74, Grad Norm: 0.0060 +[2025-02-21 16:11:23] (step=0754700) Train Loss: 0.1468, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 16:11:31] (step=0754800) Train Loss: 0.1447, Train Steps/Sec: 11.77, Grad Norm: 0.0061 +[2025-02-21 16:11:40] (step=0754900) Train Loss: 0.1445, Train Steps/Sec: 11.80, Grad Norm: 0.0060 +[2025-02-21 16:11:48] (step=0755000) Train Loss: 0.1455, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 16:11:55] (step=0755100) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 16:12:03] (step=0755200) Train Loss: 0.1478, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 16:12:11] (step=0755300) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0056 +[2025-02-21 16:12:19] (step=0755400) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 16:12:27] (step=0755500) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 16:12:35] (step=0755600) Train Loss: 0.1460, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 16:12:44] (step=0755700) Train Loss: 0.1446, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-21 16:12:52] (step=0755800) Train Loss: 0.1445, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 16:13:00] (step=0755900) Train Loss: 0.1451, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 16:13:08] (step=0756000) Train Loss: 0.1445, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 16:13:16] (step=0756100) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 16:13:24] (step=0756200) Train Loss: 0.1457, Train Steps/Sec: 11.62, Grad Norm: 0.0060 +[2025-02-21 16:13:34] (step=0756300) Train Loss: 0.1461, Train Steps/Sec: 10.94, Grad Norm: 0.0061 +[2025-02-21 16:13:44] (step=0756400) Train Loss: 0.1458, Train Steps/Sec: 9.71, Grad Norm: 0.0058 +[2025-02-21 16:13:54] (step=0756500) Train Loss: 0.1460, Train Steps/Sec: 10.30, Grad Norm: 0.0060 +[2025-02-21 16:14:02] (step=0756600) Train Loss: 0.1451, Train Steps/Sec: 11.35, Grad Norm: 0.0062 +[2025-02-21 16:14:10] (step=0756700) Train Loss: 0.1474, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 16:14:19] (step=0756800) Train Loss: 0.1474, Train Steps/Sec: 11.75, Grad Norm: 0.0060 +[2025-02-21 16:14:29] (step=0756900) Train Loss: 0.1453, Train Steps/Sec: 10.27, Grad Norm: 0.0062 +[2025-02-21 16:14:37] (step=0757000) Train Loss: 0.1467, Train Steps/Sec: 11.78, Grad Norm: 0.0060 +[2025-02-21 16:14:45] (step=0757100) Train Loss: 0.1453, Train Steps/Sec: 12.17, Grad Norm: 0.0057 +[2025-02-21 16:14:53] (step=0757200) Train Loss: 0.1445, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 16:15:01] (step=0757300) Train Loss: 0.1448, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 16:15:09] (step=0757400) Train Loss: 0.1451, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 16:15:17] (step=0757500) Train Loss: 0.1454, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 16:15:25] (step=0757600) Train Loss: 0.1446, Train Steps/Sec: 12.55, Grad Norm: 0.0057 +[2025-02-21 16:15:33] (step=0757700) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 16:15:41] (step=0757800) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 16:15:49] (step=0757900) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 16:15:57] (step=0758000) Train Loss: 0.1450, Train Steps/Sec: 12.55, Grad Norm: 0.0058 +[2025-02-21 16:16:05] (step=0758100) Train Loss: 0.1459, Train Steps/Sec: 12.56, Grad Norm: 0.0057 +[2025-02-21 16:16:14] (step=0758200) Train Loss: 0.1462, Train Steps/Sec: 11.20, Grad Norm: 0.0060 +[2025-02-21 16:16:22] (step=0758300) Train Loss: 0.1456, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 16:16:30] (step=0758400) Train Loss: 0.1456, Train Steps/Sec: 11.70, Grad Norm: 0.0060 +[2025-02-21 16:16:40] (step=0758500) Train Loss: 0.1463, Train Steps/Sec: 9.91, Grad Norm: 0.0062 +[2025-02-21 16:16:50] (step=0758600) Train Loss: 0.1454, Train Steps/Sec: 10.66, Grad Norm: 0.0060 +[2025-02-21 16:17:00] (step=0758700) Train Loss: 0.1465, Train Steps/Sec: 9.94, Grad Norm: 0.0061 +[2025-02-21 16:17:08] (step=0758800) Train Loss: 0.1454, Train Steps/Sec: 11.70, Grad Norm: 0.0058 +[2025-02-21 16:17:16] (step=0758900) Train Loss: 0.1454, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 16:17:25] (step=0759000) Train Loss: 0.1451, Train Steps/Sec: 11.04, Grad Norm: 0.0058 +[2025-02-21 16:17:34] (step=0759100) Train Loss: 0.1465, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 16:17:42] (step=0759200) Train Loss: 0.1447, Train Steps/Sec: 11.38, Grad Norm: 0.0059 +[2025-02-21 16:17:50] (step=0759300) Train Loss: 0.1445, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 16:17:59] (step=0759400) Train Loss: 0.1469, Train Steps/Sec: 11.14, Grad Norm: 0.0061 +[2025-02-21 16:18:07] (step=0759500) Train Loss: 0.1458, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 16:18:15] (step=0759600) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 16:18:23] (step=0759700) Train Loss: 0.1453, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 16:18:31] (step=0759800) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 16:18:39] (step=0759900) Train Loss: 0.1449, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 16:18:47] (step=0760000) Train Loss: 0.1448, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 16:18:50] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0760000.pt +[2025-02-21 16:18:58] (step=0760100) Train Loss: 0.1460, Train Steps/Sec: 9.07, Grad Norm: 0.0058 +[2025-02-21 16:19:06] (step=0760200) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 16:19:14] (step=0760300) Train Loss: 0.1445, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 16:19:22] (step=0760400) Train Loss: 0.1452, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 16:19:30] (step=0760500) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 16:19:38] (step=0760600) Train Loss: 0.1456, Train Steps/Sec: 11.70, Grad Norm: 0.0059 +[2025-02-21 16:19:50] (step=0760700) Train Loss: 0.1460, Train Steps/Sec: 8.80, Grad Norm: 0.0060 +[2025-02-21 16:20:00] (step=0760800) Train Loss: 0.1453, Train Steps/Sec: 10.02, Grad Norm: 0.0057 +[2025-02-21 16:20:09] (step=0760900) Train Loss: 0.1463, Train Steps/Sec: 10.95, Grad Norm: 0.0062 +[2025-02-21 16:20:18] (step=0761000) Train Loss: 0.1463, Train Steps/Sec: 11.33, Grad Norm: 0.0060 +[2025-02-21 16:20:26] (step=0761100) Train Loss: 0.1450, Train Steps/Sec: 11.38, Grad Norm: 0.0057 +[2025-02-21 16:20:35] (step=0761200) Train Loss: 0.1451, Train Steps/Sec: 11.76, Grad Norm: 0.0060 +[2025-02-21 16:20:44] (step=0761300) Train Loss: 0.1459, Train Steps/Sec: 11.41, Grad Norm: 0.0057 +[2025-02-21 16:20:52] (step=0761400) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 16:21:00] (step=0761500) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 16:21:08] (step=0761600) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 16:21:15] (step=0761700) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 16:21:23] (step=0761800) Train Loss: 0.1459, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 16:21:32] (step=0761900) Train Loss: 0.1474, Train Steps/Sec: 11.21, Grad Norm: 0.0062 +[2025-02-21 16:21:40] (step=0762000) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 16:21:48] (step=0762100) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 16:21:56] (step=0762200) Train Loss: 0.1453, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 16:22:04] (step=0762300) Train Loss: 0.1463, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 16:22:12] (step=0762400) Train Loss: 0.1457, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 16:22:20] (step=0762500) Train Loss: 0.1471, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 16:22:28] (step=0762600) Train Loss: 0.1450, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 16:22:36] (step=0762700) Train Loss: 0.1453, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 16:22:44] (step=0762800) Train Loss: 0.1463, Train Steps/Sec: 12.17, Grad Norm: 0.0059 +[2025-02-21 16:22:55] (step=0762900) Train Loss: 0.1450, Train Steps/Sec: 9.34, Grad Norm: 0.0057 +[2025-02-21 16:23:05] (step=0763000) Train Loss: 0.1459, Train Steps/Sec: 10.35, Grad Norm: 0.0060 +[2025-02-21 16:23:14] (step=0763100) Train Loss: 0.1453, Train Steps/Sec: 10.91, Grad Norm: 0.0060 +[2025-02-21 16:23:24] (step=0763200) Train Loss: 0.1464, Train Steps/Sec: 9.96, Grad Norm: 0.0061 +[2025-02-21 16:23:33] (step=0763300) Train Loss: 0.1470, Train Steps/Sec: 11.37, Grad Norm: 0.0059 +[2025-02-21 16:23:41] (step=0763400) Train Loss: 0.1450, Train Steps/Sec: 11.78, Grad Norm: 0.0058 +[2025-02-21 16:23:50] (step=0763500) Train Loss: 0.1462, Train Steps/Sec: 11.78, Grad Norm: 0.0062 +[2025-02-21 16:23:58] (step=0763600) Train Loss: 0.1447, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 16:24:06] (step=0763700) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 16:24:13] (step=0763800) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 16:24:21] (step=0763900) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 16:24:29] (step=0764000) Train Loss: 0.1437, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 16:24:37] (step=0764100) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 16:24:45] (step=0764200) Train Loss: 0.1451, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 16:24:53] (step=0764300) Train Loss: 0.1455, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 16:25:02] (step=0764400) Train Loss: 0.1456, Train Steps/Sec: 11.27, Grad Norm: 0.0058 +[2025-02-21 16:25:10] (step=0764500) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 16:25:18] (step=0764600) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 16:25:26] (step=0764700) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 16:25:34] (step=0764800) Train Loss: 0.1456, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 16:25:42] (step=0764900) Train Loss: 0.1458, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 16:25:51] (step=0765000) Train Loss: 0.1470, Train Steps/Sec: 11.33, Grad Norm: 0.0058 +[2025-02-21 16:26:01] (step=0765100) Train Loss: 0.1473, Train Steps/Sec: 9.46, Grad Norm: 0.0061 +[2025-02-21 16:26:11] (step=0765200) Train Loss: 0.1454, Train Steps/Sec: 10.65, Grad Norm: 0.0060 +[2025-02-21 16:26:20] (step=0765300) Train Loss: 0.1462, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-21 16:26:29] (step=0765400) Train Loss: 0.1469, Train Steps/Sec: 10.65, Grad Norm: 0.0060 +[2025-02-21 16:26:38] (step=0765500) Train Loss: 0.1459, Train Steps/Sec: 12.18, Grad Norm: 0.0058 +[2025-02-21 16:26:46] (step=0765600) Train Loss: 0.1466, Train Steps/Sec: 11.39, Grad Norm: 0.0059 +[2025-02-21 16:26:56] (step=0765700) Train Loss: 0.1461, Train Steps/Sec: 10.89, Grad Norm: 0.0062 +[2025-02-21 16:27:03] (step=0765800) Train Loss: 0.1449, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 16:27:11] (step=0765900) Train Loss: 0.1450, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 16:27:19] (step=0766000) Train Loss: 0.1447, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 16:27:27] (step=0766100) Train Loss: 0.1460, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 16:27:35] (step=0766200) Train Loss: 0.1460, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 16:27:43] (step=0766300) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0067 +[2025-02-21 16:27:51] (step=0766400) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 16:27:59] (step=0766500) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 16:28:07] (step=0766600) Train Loss: 0.1442, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 16:28:15] (step=0766700) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 16:28:23] (step=0766800) Train Loss: 0.1453, Train Steps/Sec: 12.55, Grad Norm: 0.0058 +[2025-02-21 16:28:32] (step=0766900) Train Loss: 0.1452, Train Steps/Sec: 11.28, Grad Norm: 0.0060 +[2025-02-21 16:28:40] (step=0767000) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 16:28:48] (step=0767100) Train Loss: 0.1463, Train Steps/Sec: 12.11, Grad Norm: 0.0058 +[2025-02-21 16:28:57] (step=0767200) Train Loss: 0.1473, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-21 16:29:07] (step=0767300) Train Loss: 0.1452, Train Steps/Sec: 9.96, Grad Norm: 0.0057 +[2025-02-21 16:29:17] (step=0767400) Train Loss: 0.1452, Train Steps/Sec: 10.32, Grad Norm: 0.0058 +[2025-02-21 16:29:27] (step=0767500) Train Loss: 0.1466, Train Steps/Sec: 10.01, Grad Norm: 0.0058 +[2025-02-21 16:29:35] (step=0767600) Train Loss: 0.1450, Train Steps/Sec: 11.78, Grad Norm: 0.0057 +[2025-02-21 16:29:44] (step=0767700) Train Loss: 0.1457, Train Steps/Sec: 11.42, Grad Norm: 0.0058 +[2025-02-21 16:29:52] (step=0767800) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 16:30:00] (step=0767900) Train Loss: 0.1478, Train Steps/Sec: 12.16, Grad Norm: 0.0060 +[2025-02-21 16:30:08] (step=0768000) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 16:30:16] (step=0768100) Train Loss: 0.1463, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 16:30:25] (step=0768200) Train Loss: 0.1456, Train Steps/Sec: 11.30, Grad Norm: 0.0059 +[2025-02-21 16:30:33] (step=0768300) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 16:30:41] (step=0768400) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 16:30:49] (step=0768500) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0065 +[2025-02-21 16:30:57] (step=0768600) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 16:31:05] (step=0768700) Train Loss: 0.1461, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 16:31:13] (step=0768800) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0056 +[2025-02-21 16:31:20] (step=0768900) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 16:31:28] (step=0769000) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 16:31:36] (step=0769100) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 16:31:44] (step=0769200) Train Loss: 0.1447, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 16:31:53] (step=0769300) Train Loss: 0.1455, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 16:32:03] (step=0769400) Train Loss: 0.1466, Train Steps/Sec: 9.11, Grad Norm: 0.0059 +[2025-02-21 16:32:13] (step=0769500) Train Loss: 0.1445, Train Steps/Sec: 10.55, Grad Norm: 0.0064 +[2025-02-21 16:32:23] (step=0769600) Train Loss: 0.1471, Train Steps/Sec: 9.71, Grad Norm: 0.0061 +[2025-02-21 16:32:33] (step=0769700) Train Loss: 0.1454, Train Steps/Sec: 10.30, Grad Norm: 0.0065 +[2025-02-21 16:32:41] (step=0769800) Train Loss: 0.1457, Train Steps/Sec: 11.80, Grad Norm: 0.0057 +[2025-02-21 16:32:50] (step=0769900) Train Loss: 0.1476, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 16:32:58] (step=0770000) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 16:33:06] (step=0770100) Train Loss: 0.1448, Train Steps/Sec: 12.19, Grad Norm: 0.0058 +[2025-02-21 16:33:14] (step=0770200) Train Loss: 0.1454, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 16:33:22] (step=0770300) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 16:33:30] (step=0770400) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 16:33:38] (step=0770500) Train Loss: 0.1481, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 16:33:45] (step=0770600) Train Loss: 0.1445, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 16:33:54] (step=0770700) Train Loss: 0.1452, Train Steps/Sec: 11.30, Grad Norm: 0.0058 +[2025-02-21 16:34:02] (step=0770800) Train Loss: 0.1450, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 16:34:10] (step=0770900) Train Loss: 0.1478, Train Steps/Sec: 12.54, Grad Norm: 0.0066 +[2025-02-21 16:34:18] (step=0771000) Train Loss: 0.1464, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 16:34:26] (step=0771100) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 16:34:34] (step=0771200) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 16:34:42] (step=0771300) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 16:34:50] (step=0771400) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 16:34:58] (step=0771500) Train Loss: 0.1459, Train Steps/Sec: 12.09, Grad Norm: 0.0061 +[2025-02-21 16:35:08] (step=0771600) Train Loss: 0.1460, Train Steps/Sec: 10.30, Grad Norm: 0.0062 +[2025-02-21 16:35:18] (step=0771700) Train Loss: 0.1473, Train Steps/Sec: 9.88, Grad Norm: 0.0057 +[2025-02-21 16:35:29] (step=0771800) Train Loss: 0.1466, Train Steps/Sec: 9.46, Grad Norm: 0.0060 +[2025-02-21 16:35:39] (step=0771900) Train Loss: 0.1470, Train Steps/Sec: 9.68, Grad Norm: 0.0061 +[2025-02-21 16:35:47] (step=0772000) Train Loss: 0.1447, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 16:35:55] (step=0772100) Train Loss: 0.1463, Train Steps/Sec: 12.17, Grad Norm: 0.0065 +[2025-02-21 16:36:03] (step=0772200) Train Loss: 0.1471, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 16:36:12] (step=0772300) Train Loss: 0.1457, Train Steps/Sec: 12.17, Grad Norm: 0.0059 +[2025-02-21 16:36:19] (step=0772400) Train Loss: 0.1450, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 16:36:27] (step=0772500) Train Loss: 0.1466, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 16:36:35] (step=0772600) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 16:36:43] (step=0772700) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 16:36:51] (step=0772800) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 16:36:59] (step=0772900) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 16:37:07] (step=0773000) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 16:37:15] (step=0773100) Train Loss: 0.1462, Train Steps/Sec: 12.55, Grad Norm: 0.0060 +[2025-02-21 16:37:24] (step=0773200) Train Loss: 0.1459, Train Steps/Sec: 11.29, Grad Norm: 0.0060 +[2025-02-21 16:37:32] (step=0773300) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 16:37:40] (step=0773400) Train Loss: 0.1463, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 16:37:48] (step=0773500) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 16:37:56] (step=0773600) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 16:38:04] (step=0773700) Train Loss: 0.1472, Train Steps/Sec: 11.33, Grad Norm: 0.0061 +[2025-02-21 16:38:14] (step=0773800) Train Loss: 0.1451, Train Steps/Sec: 10.00, Grad Norm: 0.0062 +[2025-02-21 16:38:24] (step=0773900) Train Loss: 0.1457, Train Steps/Sec: 9.99, Grad Norm: 0.0059 +[2025-02-21 16:38:35] (step=0774000) Train Loss: 0.1464, Train Steps/Sec: 9.47, Grad Norm: 0.0062 +[2025-02-21 16:38:44] (step=0774100) Train Loss: 0.1449, Train Steps/Sec: 11.03, Grad Norm: 0.0060 +[2025-02-21 16:38:52] (step=0774200) Train Loss: 0.1456, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 16:39:00] (step=0774300) Train Loss: 0.1451, Train Steps/Sec: 12.17, Grad Norm: 0.0062 +[2025-02-21 16:39:09] (step=0774400) Train Loss: 0.1451, Train Steps/Sec: 11.21, Grad Norm: 0.0059 +[2025-02-21 16:39:17] (step=0774500) Train Loss: 0.1451, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 16:39:25] (step=0774600) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 16:39:33] (step=0774700) Train Loss: 0.1448, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 16:39:41] (step=0774800) Train Loss: 0.1461, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 16:39:49] (step=0774900) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 16:39:57] (step=0775000) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 16:40:05] (step=0775100) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 16:40:13] (step=0775200) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 16:40:21] (step=0775300) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 16:40:29] (step=0775400) Train Loss: 0.1451, Train Steps/Sec: 12.61, Grad Norm: 0.0056 +[2025-02-21 16:40:37] (step=0775500) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 16:40:45] (step=0775600) Train Loss: 0.1459, Train Steps/Sec: 12.55, Grad Norm: 0.0059 +[2025-02-21 16:40:53] (step=0775700) Train Loss: 0.1457, Train Steps/Sec: 11.32, Grad Norm: 0.0058 +[2025-02-21 16:41:02] (step=0775800) Train Loss: 0.1467, Train Steps/Sec: 12.19, Grad Norm: 0.0059 +[2025-02-21 16:41:11] (step=0775900) Train Loss: 0.1462, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-21 16:41:21] (step=0776000) Train Loss: 0.1454, Train Steps/Sec: 9.72, Grad Norm: 0.0059 +[2025-02-21 16:41:32] (step=0776100) Train Loss: 0.1450, Train Steps/Sec: 9.19, Grad Norm: 0.0061 +[2025-02-21 16:41:41] (step=0776200) Train Loss: 0.1462, Train Steps/Sec: 10.38, Grad Norm: 0.0058 +[2025-02-21 16:41:50] (step=0776300) Train Loss: 0.1454, Train Steps/Sec: 11.74, Grad Norm: 0.0058 +[2025-02-21 16:41:58] (step=0776400) Train Loss: 0.1460, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 16:42:06] (step=0776500) Train Loss: 0.1462, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 16:42:14] (step=0776600) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 16:42:22] (step=0776700) Train Loss: 0.1456, Train Steps/Sec: 12.19, Grad Norm: 0.0063 +[2025-02-21 16:42:30] (step=0776800) Train Loss: 0.1443, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 16:42:39] (step=0776900) Train Loss: 0.1467, Train Steps/Sec: 11.22, Grad Norm: 0.0058 +[2025-02-21 16:42:47] (step=0777000) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 16:42:55] (step=0777100) Train Loss: 0.1444, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 16:43:03] (step=0777200) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 16:43:11] (step=0777300) Train Loss: 0.1458, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 16:43:19] (step=0777400) Train Loss: 0.1451, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 16:43:27] (step=0777500) Train Loss: 0.1458, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 16:43:35] (step=0777600) Train Loss: 0.1456, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 16:43:42] (step=0777700) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 16:43:50] (step=0777800) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 16:43:58] (step=0777900) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 16:44:06] (step=0778000) Train Loss: 0.1466, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 16:44:15] (step=0778100) Train Loss: 0.1449, Train Steps/Sec: 11.24, Grad Norm: 0.0065 +[2025-02-21 16:44:28] (step=0778200) Train Loss: 0.1464, Train Steps/Sec: 8.23, Grad Norm: 0.0062 +[2025-02-21 16:44:38] (step=0778300) Train Loss: 0.1474, Train Steps/Sec: 9.97, Grad Norm: 0.0059 +[2025-02-21 16:44:48] (step=0778400) Train Loss: 0.1451, Train Steps/Sec: 9.99, Grad Norm: 0.0058 +[2025-02-21 16:44:56] (step=0778500) Train Loss: 0.1451, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 16:45:03] (step=0778600) Train Loss: 0.1444, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 16:45:12] (step=0778700) Train Loss: 0.1452, Train Steps/Sec: 12.16, Grad Norm: 0.0062 +[2025-02-21 16:45:20] (step=0778800) Train Loss: 0.1445, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 16:45:28] (step=0778900) Train Loss: 0.1455, Train Steps/Sec: 12.17, Grad Norm: 0.0059 +[2025-02-21 16:45:36] (step=0779000) Train Loss: 0.1452, Train Steps/Sec: 12.58, Grad Norm: 0.0057 +[2025-02-21 16:45:44] (step=0779100) Train Loss: 0.1469, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 16:45:52] (step=0779200) Train Loss: 0.1463, Train Steps/Sec: 12.56, Grad Norm: 0.0057 +[2025-02-21 16:46:00] (step=0779300) Train Loss: 0.1454, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 16:46:09] (step=0779400) Train Loss: 0.1456, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-21 16:46:17] (step=0779500) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 16:46:24] (step=0779600) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 16:46:32] (step=0779700) Train Loss: 0.1446, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 16:46:40] (step=0779800) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 16:46:48] (step=0779900) Train Loss: 0.1452, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 16:46:56] (step=0780000) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 16:47:00] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0780000.pt +[2025-02-21 16:47:08] (step=0780100) Train Loss: 0.1461, Train Steps/Sec: 8.30, Grad Norm: 0.0060 +[2025-02-21 16:47:16] (step=0780200) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 16:47:26] (step=0780300) Train Loss: 0.1458, Train Steps/Sec: 10.32, Grad Norm: 0.0067 +[2025-02-21 16:47:37] (step=0780400) Train Loss: 0.1455, Train Steps/Sec: 9.41, Grad Norm: 0.0061 +[2025-02-21 16:47:46] (step=0780500) Train Loss: 0.1462, Train Steps/Sec: 10.03, Grad Norm: 0.0058 +[2025-02-21 16:47:56] (step=0780600) Train Loss: 0.1447, Train Steps/Sec: 10.26, Grad Norm: 0.0061 +[2025-02-21 16:48:05] (step=0780700) Train Loss: 0.1455, Train Steps/Sec: 11.28, Grad Norm: 0.0057 +[2025-02-21 16:48:13] (step=0780800) Train Loss: 0.1461, Train Steps/Sec: 12.14, Grad Norm: 0.0064 +[2025-02-21 16:48:21] (step=0780900) Train Loss: 0.1464, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 16:48:30] (step=0781000) Train Loss: 0.1453, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 16:48:37] (step=0781100) Train Loss: 0.1468, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 16:48:45] (step=0781200) Train Loss: 0.1456, Train Steps/Sec: 12.56, Grad Norm: 0.0057 +[2025-02-21 16:48:53] (step=0781300) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 16:49:01] (step=0781400) Train Loss: 0.1444, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 16:49:09] (step=0781500) Train Loss: 0.1452, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 16:49:17] (step=0781600) Train Loss: 0.1465, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 16:49:25] (step=0781700) Train Loss: 0.1475, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 16:49:33] (step=0781800) Train Loss: 0.1445, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 16:49:42] (step=0781900) Train Loss: 0.1455, Train Steps/Sec: 11.27, Grad Norm: 0.0063 +[2025-02-21 16:49:50] (step=0782000) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 16:49:58] (step=0782100) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 16:50:06] (step=0782200) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 16:50:14] (step=0782300) Train Loss: 0.1464, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 16:50:22] (step=0782400) Train Loss: 0.1477, Train Steps/Sec: 12.10, Grad Norm: 0.0060 +[2025-02-21 16:50:32] (step=0782500) Train Loss: 0.1464, Train Steps/Sec: 9.70, Grad Norm: 0.0061 +[2025-02-21 16:50:43] (step=0782600) Train Loss: 0.1455, Train Steps/Sec: 9.40, Grad Norm: 0.0060 +[2025-02-21 16:50:52] (step=0782700) Train Loss: 0.1482, Train Steps/Sec: 10.94, Grad Norm: 0.0059 +[2025-02-21 16:51:02] (step=0782800) Train Loss: 0.1466, Train Steps/Sec: 10.57, Grad Norm: 0.0062 +[2025-02-21 16:51:10] (step=0782900) Train Loss: 0.1449, Train Steps/Sec: 12.54, Grad Norm: 0.0059 +[2025-02-21 16:51:18] (step=0783000) Train Loss: 0.1466, Train Steps/Sec: 12.08, Grad Norm: 0.0059 +[2025-02-21 16:51:26] (step=0783100) Train Loss: 0.1454, Train Steps/Sec: 12.47, Grad Norm: 0.0064 +[2025-02-21 16:51:35] (step=0783200) Train Loss: 0.1452, Train Steps/Sec: 10.84, Grad Norm: 0.0060 +[2025-02-21 16:51:43] (step=0783300) Train Loss: 0.1451, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 16:51:51] (step=0783400) Train Loss: 0.1454, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 16:51:59] (step=0783500) Train Loss: 0.1448, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 16:52:07] (step=0783600) Train Loss: 0.1463, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 16:52:15] (step=0783700) Train Loss: 0.1460, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 16:52:23] (step=0783800) Train Loss: 0.1457, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 16:52:31] (step=0783900) Train Loss: 0.1461, Train Steps/Sec: 12.52, Grad Norm: 0.0067 +[2025-02-21 16:52:39] (step=0784000) Train Loss: 0.1455, Train Steps/Sec: 12.54, Grad Norm: 0.0060 +[2025-02-21 16:52:47] (step=0784100) Train Loss: 0.1466, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 16:52:55] (step=0784200) Train Loss: 0.1458, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 16:53:03] (step=0784300) Train Loss: 0.1444, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 16:53:12] (step=0784400) Train Loss: 0.1452, Train Steps/Sec: 10.86, Grad Norm: 0.0061 +[2025-02-21 16:53:20] (step=0784500) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 16:53:29] (step=0784600) Train Loss: 0.1457, Train Steps/Sec: 11.31, Grad Norm: 0.0061 +[2025-02-21 16:53:39] (step=0784700) Train Loss: 0.1466, Train Steps/Sec: 9.99, Grad Norm: 0.0062 +[2025-02-21 16:53:50] (step=0784800) Train Loss: 0.1458, Train Steps/Sec: 9.17, Grad Norm: 0.0061 +[2025-02-21 16:53:59] (step=0784900) Train Loss: 0.1457, Train Steps/Sec: 10.28, Grad Norm: 0.0060 +[2025-02-21 16:54:08] (step=0785000) Train Loss: 0.1460, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 16:54:16] (step=0785100) Train Loss: 0.1453, Train Steps/Sec: 12.55, Grad Norm: 0.0059 +[2025-02-21 16:54:24] (step=0785200) Train Loss: 0.1466, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 16:54:32] (step=0785300) Train Loss: 0.1470, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 16:54:41] (step=0785400) Train Loss: 0.1467, Train Steps/Sec: 12.10, Grad Norm: 0.0059 +[2025-02-21 16:54:49] (step=0785500) Train Loss: 0.1450, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 16:54:57] (step=0785600) Train Loss: 0.1462, Train Steps/Sec: 12.54, Grad Norm: 0.0058 +[2025-02-21 16:55:05] (step=0785700) Train Loss: 0.1456, Train Steps/Sec: 11.30, Grad Norm: 0.0060 +[2025-02-21 16:55:13] (step=0785800) Train Loss: 0.1452, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 16:55:21] (step=0785900) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 16:55:29] (step=0786000) Train Loss: 0.1450, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 16:55:37] (step=0786100) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 16:55:45] (step=0786200) Train Loss: 0.1443, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 16:55:53] (step=0786300) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 16:56:01] (step=0786400) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 16:56:09] (step=0786500) Train Loss: 0.1445, Train Steps/Sec: 12.20, Grad Norm: 0.0058 +[2025-02-21 16:56:17] (step=0786600) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 16:56:26] (step=0786700) Train Loss: 0.1456, Train Steps/Sec: 11.72, Grad Norm: 0.0058 +[2025-02-21 16:56:34] (step=0786800) Train Loss: 0.1461, Train Steps/Sec: 11.37, Grad Norm: 0.0060 +[2025-02-21 16:56:46] (step=0786900) Train Loss: 0.1461, Train Steps/Sec: 8.64, Grad Norm: 0.0058 +[2025-02-21 16:56:56] (step=0787000) Train Loss: 0.1451, Train Steps/Sec: 10.24, Grad Norm: 0.0059 +[2025-02-21 16:57:05] (step=0787100) Train Loss: 0.1464, Train Steps/Sec: 10.58, Grad Norm: 0.0060 +[2025-02-21 16:57:14] (step=0787200) Train Loss: 0.1459, Train Steps/Sec: 11.37, Grad Norm: 0.0062 +[2025-02-21 16:57:22] (step=0787300) Train Loss: 0.1456, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 16:57:30] (step=0787400) Train Loss: 0.1451, Train Steps/Sec: 12.19, Grad Norm: 0.0060 +[2025-02-21 16:57:38] (step=0787500) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 16:57:46] (step=0787600) Train Loss: 0.1469, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 16:57:54] (step=0787700) Train Loss: 0.1448, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 16:58:02] (step=0787800) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 16:58:10] (step=0787900) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 16:58:18] (step=0788000) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 16:58:26] (step=0788100) Train Loss: 0.1442, Train Steps/Sec: 12.54, Grad Norm: 0.0063 +[2025-02-21 16:58:35] (step=0788200) Train Loss: 0.1429, Train Steps/Sec: 11.29, Grad Norm: 0.0064 +[2025-02-21 16:58:43] (step=0788300) Train Loss: 0.1474, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 16:58:51] (step=0788400) Train Loss: 0.1452, Train Steps/Sec: 12.56, Grad Norm: 0.0057 +[2025-02-21 16:58:59] (step=0788500) Train Loss: 0.1455, Train Steps/Sec: 12.58, Grad Norm: 0.0065 +[2025-02-21 16:59:07] (step=0788600) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 16:59:15] (step=0788700) Train Loss: 0.1461, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 16:59:23] (step=0788800) Train Loss: 0.1450, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 16:59:32] (step=0788900) Train Loss: 0.1476, Train Steps/Sec: 11.38, Grad Norm: 0.0059 +[2025-02-21 16:59:40] (step=0789000) Train Loss: 0.1450, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 16:59:51] (step=0789100) Train Loss: 0.1472, Train Steps/Sec: 9.20, Grad Norm: 0.0060 +[2025-02-21 17:00:01] (step=0789200) Train Loss: 0.1455, Train Steps/Sec: 10.30, Grad Norm: 0.0060 +[2025-02-21 17:00:10] (step=0789300) Train Loss: 0.1463, Train Steps/Sec: 10.93, Grad Norm: 0.0059 +[2025-02-21 17:00:20] (step=0789400) Train Loss: 0.1452, Train Steps/Sec: 10.26, Grad Norm: 0.0062 +[2025-02-21 17:00:28] (step=0789500) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 17:00:36] (step=0789600) Train Loss: 0.1460, Train Steps/Sec: 12.16, Grad Norm: 0.0060 +[2025-02-21 17:00:44] (step=0789700) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 17:00:52] (step=0789800) Train Loss: 0.1451, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 17:01:00] (step=0789900) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 17:01:08] (step=0790000) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 17:01:16] (step=0790100) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 17:01:24] (step=0790200) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 17:01:32] (step=0790300) Train Loss: 0.1482, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 17:01:40] (step=0790400) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 17:01:47] (step=0790500) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 17:01:55] (step=0790600) Train Loss: 0.1475, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 17:02:04] (step=0790700) Train Loss: 0.1457, Train Steps/Sec: 11.30, Grad Norm: 0.0058 +[2025-02-21 17:02:13] (step=0790800) Train Loss: 0.1453, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 17:02:20] (step=0790900) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0066 +[2025-02-21 17:02:29] (step=0791000) Train Loss: 0.1459, Train Steps/Sec: 11.71, Grad Norm: 0.0058 +[2025-02-21 17:02:38] (step=0791100) Train Loss: 0.1449, Train Steps/Sec: 11.39, Grad Norm: 0.0060 +[2025-02-21 17:02:47] (step=0791200) Train Loss: 0.1450, Train Steps/Sec: 10.68, Grad Norm: 0.0061 +[2025-02-21 17:02:57] (step=0791300) Train Loss: 0.1462, Train Steps/Sec: 10.28, Grad Norm: 0.0060 +[2025-02-21 17:03:07] (step=0791400) Train Loss: 0.1458, Train Steps/Sec: 10.31, Grad Norm: 0.0061 +[2025-02-21 17:03:15] (step=0791500) Train Loss: 0.1457, Train Steps/Sec: 11.35, Grad Norm: 0.0057 +[2025-02-21 17:03:24] (step=0791600) Train Loss: 0.1460, Train Steps/Sec: 11.33, Grad Norm: 0.0060 +[2025-02-21 17:03:32] (step=0791700) Train Loss: 0.1455, Train Steps/Sec: 12.15, Grad Norm: 0.0066 +[2025-02-21 17:03:40] (step=0791800) Train Loss: 0.1448, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 17:03:49] (step=0791900) Train Loss: 0.1461, Train Steps/Sec: 11.23, Grad Norm: 0.0059 +[2025-02-21 17:03:58] (step=0792000) Train Loss: 0.1473, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 17:04:05] (step=0792100) Train Loss: 0.1452, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 17:04:13] (step=0792200) Train Loss: 0.1445, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 17:04:21] (step=0792300) Train Loss: 0.1462, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 17:04:29] (step=0792400) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 17:04:37] (step=0792500) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 17:04:45] (step=0792600) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 17:04:53] (step=0792700) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 17:05:01] (step=0792800) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 17:05:09] (step=0792900) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 17:05:17] (step=0793000) Train Loss: 0.1468, Train Steps/Sec: 12.17, Grad Norm: 0.0063 +[2025-02-21 17:05:25] (step=0793100) Train Loss: 0.1479, Train Steps/Sec: 12.16, Grad Norm: 0.0060 +[2025-02-21 17:05:35] (step=0793200) Train Loss: 0.1459, Train Steps/Sec: 9.97, Grad Norm: 0.0057 +[2025-02-21 17:05:43] (step=0793300) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 17:05:53] (step=0793400) Train Loss: 0.1466, Train Steps/Sec: 10.66, Grad Norm: 0.0059 +[2025-02-21 17:06:02] (step=0793500) Train Loss: 0.1460, Train Steps/Sec: 10.29, Grad Norm: 0.0058 +[2025-02-21 17:06:12] (step=0793600) Train Loss: 0.1464, Train Steps/Sec: 10.27, Grad Norm: 0.0065 +[2025-02-21 17:06:21] (step=0793700) Train Loss: 0.1462, Train Steps/Sec: 10.97, Grad Norm: 0.0059 +[2025-02-21 17:06:30] (step=0793800) Train Loss: 0.1465, Train Steps/Sec: 11.69, Grad Norm: 0.0059 +[2025-02-21 17:06:38] (step=0793900) Train Loss: 0.1455, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 17:06:46] (step=0794000) Train Loss: 0.1456, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 17:06:54] (step=0794100) Train Loss: 0.1447, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 17:07:02] (step=0794200) Train Loss: 0.1461, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 17:07:10] (step=0794300) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 17:07:19] (step=0794400) Train Loss: 0.1454, Train Steps/Sec: 11.22, Grad Norm: 0.0063 +[2025-02-21 17:07:27] (step=0794500) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 17:07:35] (step=0794600) Train Loss: 0.1448, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 17:07:43] (step=0794700) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 17:07:51] (step=0794800) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 17:07:59] (step=0794900) Train Loss: 0.1454, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 17:08:07] (step=0795000) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 17:08:15] (step=0795100) Train Loss: 0.1448, Train Steps/Sec: 12.19, Grad Norm: 0.0059 +[2025-02-21 17:08:23] (step=0795200) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 17:08:32] (step=0795300) Train Loss: 0.1452, Train Steps/Sec: 10.98, Grad Norm: 0.0061 +[2025-02-21 17:08:40] (step=0795400) Train Loss: 0.1456, Train Steps/Sec: 12.14, Grad Norm: 0.0057 +[2025-02-21 17:08:49] (step=0795500) Train Loss: 0.1452, Train Steps/Sec: 11.80, Grad Norm: 0.0061 +[2025-02-21 17:08:58] (step=0795600) Train Loss: 0.1463, Train Steps/Sec: 11.32, Grad Norm: 0.0062 +[2025-02-21 17:09:08] (step=0795700) Train Loss: 0.1462, Train Steps/Sec: 9.13, Grad Norm: 0.0059 +[2025-02-21 17:09:19] (step=0795800) Train Loss: 0.1464, Train Steps/Sec: 9.93, Grad Norm: 0.0061 +[2025-02-21 17:09:28] (step=0795900) Train Loss: 0.1451, Train Steps/Sec: 11.00, Grad Norm: 0.0059 +[2025-02-21 17:09:36] (step=0796000) Train Loss: 0.1454, Train Steps/Sec: 12.15, Grad Norm: 0.0064 +[2025-02-21 17:09:44] (step=0796100) Train Loss: 0.1464, Train Steps/Sec: 12.17, Grad Norm: 0.0062 +[2025-02-21 17:09:52] (step=0796200) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 17:10:00] (step=0796300) Train Loss: 0.1454, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 17:10:08] (step=0796400) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 17:10:16] (step=0796500) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 17:10:24] (step=0796600) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 17:10:32] (step=0796700) Train Loss: 0.1435, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 17:10:40] (step=0796800) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 17:10:49] (step=0796900) Train Loss: 0.1460, Train Steps/Sec: 11.26, Grad Norm: 0.0058 +[2025-02-21 17:10:57] (step=0797000) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 17:11:05] (step=0797100) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 17:11:13] (step=0797200) Train Loss: 0.1468, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 17:11:21] (step=0797300) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 17:11:30] (step=0797400) Train Loss: 0.1444, Train Steps/Sec: 11.37, Grad Norm: 0.0062 +[2025-02-21 17:11:38] (step=0797500) Train Loss: 0.1442, Train Steps/Sec: 11.74, Grad Norm: 0.0064 +[2025-02-21 17:11:47] (step=0797600) Train Loss: 0.1453, Train Steps/Sec: 11.80, Grad Norm: 0.0058 +[2025-02-21 17:11:55] (step=0797700) Train Loss: 0.1474, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 17:12:04] (step=0797800) Train Loss: 0.1454, Train Steps/Sec: 10.88, Grad Norm: 0.0060 +[2025-02-21 17:12:14] (step=0797900) Train Loss: 0.1445, Train Steps/Sec: 9.98, Grad Norm: 0.0058 +[2025-02-21 17:12:23] (step=0798000) Train Loss: 0.1457, Train Steps/Sec: 10.26, Grad Norm: 0.0059 +[2025-02-21 17:12:33] (step=0798100) Train Loss: 0.1452, Train Steps/Sec: 10.91, Grad Norm: 0.0061 +[2025-02-21 17:12:42] (step=0798200) Train Loss: 0.1449, Train Steps/Sec: 10.86, Grad Norm: 0.0060 +[2025-02-21 17:12:50] (step=0798300) Train Loss: 0.1452, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 17:12:58] (step=0798400) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 17:13:06] (step=0798500) Train Loss: 0.1456, Train Steps/Sec: 12.14, Grad Norm: 0.0057 +[2025-02-21 17:13:14] (step=0798600) Train Loss: 0.1458, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 17:13:22] (step=0798700) Train Loss: 0.1464, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 17:13:30] (step=0798800) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 17:13:38] (step=0798900) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 17:13:46] (step=0799000) Train Loss: 0.1454, Train Steps/Sec: 12.55, Grad Norm: 0.0059 +[2025-02-21 17:13:54] (step=0799100) Train Loss: 0.1454, Train Steps/Sec: 12.56, Grad Norm: 0.0058 +[2025-02-21 17:14:02] (step=0799200) Train Loss: 0.1449, Train Steps/Sec: 12.57, Grad Norm: 0.0057 +[2025-02-21 17:14:10] (step=0799300) Train Loss: 0.1466, Train Steps/Sec: 12.11, Grad Norm: 0.0059 +[2025-02-21 17:14:19] (step=0799400) Train Loss: 0.1455, Train Steps/Sec: 11.25, Grad Norm: 0.0058 +[2025-02-21 17:14:27] (step=0799500) Train Loss: 0.1462, Train Steps/Sec: 12.10, Grad Norm: 0.0057 +[2025-02-21 17:14:36] (step=0799600) Train Loss: 0.1455, Train Steps/Sec: 10.99, Grad Norm: 0.0061 +[2025-02-21 17:14:44] (step=0799700) Train Loss: 0.1464, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 17:14:53] (step=0799800) Train Loss: 0.1472, Train Steps/Sec: 11.80, Grad Norm: 0.0059 +[2025-02-21 17:15:01] (step=0799900) Train Loss: 0.1453, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 17:15:10] (step=0800000) Train Loss: 0.1448, Train Steps/Sec: 10.58, Grad Norm: 0.0061 +[2025-02-21 17:15:13] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0800000.pt +[2025-02-21 17:15:23] (step=0800100) Train Loss: 0.1460, Train Steps/Sec: 7.91, Grad Norm: 0.0059 +[2025-02-21 17:15:33] (step=0800200) Train Loss: 0.1464, Train Steps/Sec: 10.00, Grad Norm: 0.0061 +[2025-02-21 17:15:42] (step=0800300) Train Loss: 0.1465, Train Steps/Sec: 10.98, Grad Norm: 0.0061 +[2025-02-21 17:15:50] (step=0800400) Train Loss: 0.1454, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 17:15:58] (step=0800500) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 17:16:06] (step=0800600) Train Loss: 0.1449, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 17:16:15] (step=0800700) Train Loss: 0.1456, Train Steps/Sec: 10.89, Grad Norm: 0.0061 +[2025-02-21 17:16:23] (step=0800800) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 17:16:31] (step=0800900) Train Loss: 0.1446, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 17:16:39] (step=0801000) Train Loss: 0.1450, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 17:16:47] (step=0801100) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 17:16:55] (step=0801200) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 17:17:03] (step=0801300) Train Loss: 0.1453, Train Steps/Sec: 12.63, Grad Norm: 0.0057 +[2025-02-21 17:17:11] (step=0801400) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 17:17:19] (step=0801500) Train Loss: 0.1455, Train Steps/Sec: 12.20, Grad Norm: 0.0060 +[2025-02-21 17:17:27] (step=0801600) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 17:17:36] (step=0801700) Train Loss: 0.1471, Train Steps/Sec: 10.98, Grad Norm: 0.0060 +[2025-02-21 17:17:44] (step=0801800) Train Loss: 0.1457, Train Steps/Sec: 12.19, Grad Norm: 0.0058 +[2025-02-21 17:17:54] (step=0801900) Train Loss: 0.1464, Train Steps/Sec: 10.63, Grad Norm: 0.0061 +[2025-02-21 17:18:02] (step=0802000) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 17:18:10] (step=0802100) Train Loss: 0.1452, Train Steps/Sec: 12.16, Grad Norm: 0.0061 +[2025-02-21 17:18:19] (step=0802200) Train Loss: 0.1463, Train Steps/Sec: 10.53, Grad Norm: 0.0059 +[2025-02-21 17:18:29] (step=0802300) Train Loss: 0.1444, Train Steps/Sec: 9.86, Grad Norm: 0.0060 +[2025-02-21 17:18:39] (step=0802400) Train Loss: 0.1447, Train Steps/Sec: 10.94, Grad Norm: 0.0058 +[2025-02-21 17:18:48] (step=0802500) Train Loss: 0.1462, Train Steps/Sec: 10.98, Grad Norm: 0.0060 +[2025-02-21 17:18:56] (step=0802600) Train Loss: 0.1452, Train Steps/Sec: 12.19, Grad Norm: 0.0062 +[2025-02-21 17:19:04] (step=0802700) Train Loss: 0.1449, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 17:19:12] (step=0802800) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 17:19:20] (step=0802900) Train Loss: 0.1457, Train Steps/Sec: 12.15, Grad Norm: 0.0064 +[2025-02-21 17:19:28] (step=0803000) Train Loss: 0.1440, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 17:19:36] (step=0803100) Train Loss: 0.1451, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 17:19:45] (step=0803200) Train Loss: 0.1463, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-21 17:19:53] (step=0803300) Train Loss: 0.1477, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 17:20:01] (step=0803400) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 17:20:09] (step=0803500) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 17:20:17] (step=0803600) Train Loss: 0.1466, Train Steps/Sec: 12.16, Grad Norm: 0.0058 +[2025-02-21 17:20:25] (step=0803700) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 17:20:34] (step=0803800) Train Loss: 0.1468, Train Steps/Sec: 11.36, Grad Norm: 0.0060 +[2025-02-21 17:20:42] (step=0803900) Train Loss: 0.1455, Train Steps/Sec: 11.76, Grad Norm: 0.0058 +[2025-02-21 17:20:51] (step=0804000) Train Loss: 0.1459, Train Steps/Sec: 11.80, Grad Norm: 0.0059 +[2025-02-21 17:20:58] (step=0804100) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 17:21:06] (step=0804200) Train Loss: 0.1458, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 17:21:15] (step=0804300) Train Loss: 0.1446, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 17:21:25] (step=0804400) Train Loss: 0.1441, Train Steps/Sec: 9.33, Grad Norm: 0.0058 +[2025-02-21 17:21:35] (step=0804500) Train Loss: 0.1452, Train Steps/Sec: 9.94, Grad Norm: 0.0059 +[2025-02-21 17:21:45] (step=0804600) Train Loss: 0.1460, Train Steps/Sec: 10.26, Grad Norm: 0.0059 +[2025-02-21 17:21:54] (step=0804700) Train Loss: 0.1459, Train Steps/Sec: 11.74, Grad Norm: 0.0063 +[2025-02-21 17:22:02] (step=0804800) Train Loss: 0.1470, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 17:22:10] (step=0804900) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 17:22:18] (step=0805000) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 17:22:26] (step=0805100) Train Loss: 0.1452, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 17:22:34] (step=0805200) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 17:22:42] (step=0805300) Train Loss: 0.1447, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 17:22:50] (step=0805400) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 17:22:58] (step=0805500) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 17:23:06] (step=0805600) Train Loss: 0.1457, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 17:23:15] (step=0805700) Train Loss: 0.1470, Train Steps/Sec: 10.87, Grad Norm: 0.0061 +[2025-02-21 17:23:23] (step=0805800) Train Loss: 0.1458, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 17:23:31] (step=0805900) Train Loss: 0.1452, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 17:23:40] (step=0806000) Train Loss: 0.1463, Train Steps/Sec: 10.67, Grad Norm: 0.0059 +[2025-02-21 17:23:48] (step=0806100) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 17:23:57] (step=0806200) Train Loss: 0.1463, Train Steps/Sec: 11.81, Grad Norm: 0.0058 +[2025-02-21 17:24:04] (step=0806300) Train Loss: 0.1464, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 17:24:12] (step=0806400) Train Loss: 0.1451, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 17:24:21] (step=0806500) Train Loss: 0.1457, Train Steps/Sec: 12.18, Grad Norm: 0.0058 +[2025-02-21 17:24:31] (step=0806600) Train Loss: 0.1460, Train Steps/Sec: 9.95, Grad Norm: 0.0058 +[2025-02-21 17:24:40] (step=0806700) Train Loss: 0.1460, Train Steps/Sec: 10.61, Grad Norm: 0.0058 +[2025-02-21 17:24:50] (step=0806800) Train Loss: 0.1441, Train Steps/Sec: 9.98, Grad Norm: 0.0061 +[2025-02-21 17:25:00] (step=0806900) Train Loss: 0.1469, Train Steps/Sec: 10.57, Grad Norm: 0.0061 +[2025-02-21 17:25:08] (step=0807000) Train Loss: 0.1471, Train Steps/Sec: 12.14, Grad Norm: 0.0064 +[2025-02-21 17:25:16] (step=0807100) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 17:25:24] (step=0807200) Train Loss: 0.1474, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 17:25:32] (step=0807300) Train Loss: 0.1445, Train Steps/Sec: 12.13, Grad Norm: 0.0065 +[2025-02-21 17:25:40] (step=0807400) Train Loss: 0.1446, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 17:25:48] (step=0807500) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 17:25:56] (step=0807600) Train Loss: 0.1460, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 17:26:04] (step=0807700) Train Loss: 0.1470, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 17:26:12] (step=0807800) Train Loss: 0.1444, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 17:26:20] (step=0807900) Train Loss: 0.1447, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 17:26:28] (step=0808000) Train Loss: 0.1449, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 17:26:37] (step=0808100) Train Loss: 0.1457, Train Steps/Sec: 11.35, Grad Norm: 0.0058 +[2025-02-21 17:26:46] (step=0808200) Train Loss: 0.1458, Train Steps/Sec: 10.53, Grad Norm: 0.0059 +[2025-02-21 17:26:55] (step=0808300) Train Loss: 0.1466, Train Steps/Sec: 11.76, Grad Norm: 0.0061 +[2025-02-21 17:27:03] (step=0808400) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 17:27:10] (step=0808500) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 17:27:18] (step=0808600) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 17:27:27] (step=0808700) Train Loss: 0.1469, Train Steps/Sec: 11.31, Grad Norm: 0.0057 +[2025-02-21 17:27:37] (step=0808800) Train Loss: 0.1460, Train Steps/Sec: 10.59, Grad Norm: 0.0062 +[2025-02-21 17:27:46] (step=0808900) Train Loss: 0.1451, Train Steps/Sec: 10.27, Grad Norm: 0.0061 +[2025-02-21 17:27:57] (step=0809000) Train Loss: 0.1455, Train Steps/Sec: 9.95, Grad Norm: 0.0062 +[2025-02-21 17:28:05] (step=0809100) Train Loss: 0.1453, Train Steps/Sec: 11.69, Grad Norm: 0.0058 +[2025-02-21 17:28:13] (step=0809200) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 17:28:21] (step=0809300) Train Loss: 0.1461, Train Steps/Sec: 12.58, Grad Norm: 0.0057 +[2025-02-21 17:28:30] (step=0809400) Train Loss: 0.1455, Train Steps/Sec: 11.16, Grad Norm: 0.0063 +[2025-02-21 17:28:38] (step=0809500) Train Loss: 0.1450, Train Steps/Sec: 12.09, Grad Norm: 0.0059 +[2025-02-21 17:28:46] (step=0809600) Train Loss: 0.1448, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 17:28:54] (step=0809700) Train Loss: 0.1460, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 17:29:02] (step=0809800) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 17:29:10] (step=0809900) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 17:29:18] (step=0810000) Train Loss: 0.1469, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 17:29:26] (step=0810100) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 17:29:34] (step=0810200) Train Loss: 0.1469, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 17:29:43] (step=0810300) Train Loss: 0.1458, Train Steps/Sec: 11.04, Grad Norm: 0.0061 +[2025-02-21 17:29:52] (step=0810400) Train Loss: 0.1460, Train Steps/Sec: 11.76, Grad Norm: 0.0061 +[2025-02-21 17:30:00] (step=0810500) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 17:30:08] (step=0810600) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 17:30:17] (step=0810700) Train Loss: 0.1465, Train Steps/Sec: 11.26, Grad Norm: 0.0062 +[2025-02-21 17:30:25] (step=0810800) Train Loss: 0.1450, Train Steps/Sec: 12.53, Grad Norm: 0.0058 +[2025-02-21 17:30:34] (step=0810900) Train Loss: 0.1468, Train Steps/Sec: 11.25, Grad Norm: 0.0059 +[2025-02-21 17:30:43] (step=0811000) Train Loss: 0.1453, Train Steps/Sec: 10.19, Grad Norm: 0.0058 +[2025-02-21 17:30:53] (step=0811100) Train Loss: 0.1460, Train Steps/Sec: 9.95, Grad Norm: 0.0059 +[2025-02-21 17:31:03] (step=0811200) Train Loss: 0.1466, Train Steps/Sec: 10.63, Grad Norm: 0.0063 +[2025-02-21 17:31:11] (step=0811300) Train Loss: 0.1462, Train Steps/Sec: 11.70, Grad Norm: 0.0062 +[2025-02-21 17:31:19] (step=0811400) Train Loss: 0.1457, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 17:31:27] (step=0811500) Train Loss: 0.1468, Train Steps/Sec: 12.54, Grad Norm: 0.0059 +[2025-02-21 17:31:35] (step=0811600) Train Loss: 0.1473, Train Steps/Sec: 12.56, Grad Norm: 0.0066 +[2025-02-21 17:31:44] (step=0811700) Train Loss: 0.1452, Train Steps/Sec: 12.07, Grad Norm: 0.0060 +[2025-02-21 17:31:52] (step=0811800) Train Loss: 0.1458, Train Steps/Sec: 12.54, Grad Norm: 0.0062 +[2025-02-21 17:32:00] (step=0811900) Train Loss: 0.1453, Train Steps/Sec: 11.21, Grad Norm: 0.0061 +[2025-02-21 17:32:08] (step=0812000) Train Loss: 0.1455, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 17:32:17] (step=0812100) Train Loss: 0.1464, Train Steps/Sec: 12.13, Grad Norm: 0.0057 +[2025-02-21 17:32:25] (step=0812200) Train Loss: 0.1455, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 17:32:33] (step=0812300) Train Loss: 0.1453, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 17:32:42] (step=0812400) Train Loss: 0.1443, Train Steps/Sec: 10.67, Grad Norm: 0.0063 +[2025-02-21 17:32:50] (step=0812500) Train Loss: 0.1459, Train Steps/Sec: 12.11, Grad Norm: 0.0058 +[2025-02-21 17:32:58] (step=0812600) Train Loss: 0.1456, Train Steps/Sec: 12.15, Grad Norm: 0.0057 +[2025-02-21 17:33:06] (step=0812700) Train Loss: 0.1458, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 17:33:14] (step=0812800) Train Loss: 0.1449, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 17:33:22] (step=0812900) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 17:33:30] (step=0813000) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 17:33:39] (step=0813100) Train Loss: 0.1465, Train Steps/Sec: 11.31, Grad Norm: 0.0061 +[2025-02-21 17:33:50] (step=0813200) Train Loss: 0.1470, Train Steps/Sec: 9.07, Grad Norm: 0.0060 +[2025-02-21 17:34:00] (step=0813300) Train Loss: 0.1460, Train Steps/Sec: 10.57, Grad Norm: 0.0059 +[2025-02-21 17:34:09] (step=0813400) Train Loss: 0.1455, Train Steps/Sec: 10.62, Grad Norm: 0.0057 +[2025-02-21 17:34:18] (step=0813500) Train Loss: 0.1449, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 17:34:25] (step=0813600) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 17:34:33] (step=0813700) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 17:34:41] (step=0813800) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 17:34:50] (step=0813900) Train Loss: 0.1466, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 17:34:58] (step=0814000) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 17:35:05] (step=0814100) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 17:35:13] (step=0814200) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 17:35:22] (step=0814300) Train Loss: 0.1469, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 17:35:30] (step=0814400) Train Loss: 0.1456, Train Steps/Sec: 12.53, Grad Norm: 0.0061 +[2025-02-21 17:35:39] (step=0814500) Train Loss: 0.1450, Train Steps/Sec: 10.24, Grad Norm: 0.0057 +[2025-02-21 17:35:48] (step=0814600) Train Loss: 0.1462, Train Steps/Sec: 11.73, Grad Norm: 0.0057 +[2025-02-21 17:35:56] (step=0814700) Train Loss: 0.1465, Train Steps/Sec: 11.78, Grad Norm: 0.0057 +[2025-02-21 17:36:04] (step=0814800) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0056 +[2025-02-21 17:36:12] (step=0814900) Train Loss: 0.1472, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 17:36:20] (step=0815000) Train Loss: 0.1452, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 17:36:28] (step=0815100) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 17:36:36] (step=0815200) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 17:36:45] (step=0815300) Train Loss: 0.1465, Train Steps/Sec: 10.60, Grad Norm: 0.0058 +[2025-02-21 17:36:55] (step=0815400) Train Loss: 0.1453, Train Steps/Sec: 10.91, Grad Norm: 0.0061 +[2025-02-21 17:37:05] (step=0815500) Train Loss: 0.1456, Train Steps/Sec: 9.42, Grad Norm: 0.0060 +[2025-02-21 17:37:14] (step=0815600) Train Loss: 0.1460, Train Steps/Sec: 11.77, Grad Norm: 0.0058 +[2025-02-21 17:37:23] (step=0815700) Train Loss: 0.1457, Train Steps/Sec: 10.52, Grad Norm: 0.0062 +[2025-02-21 17:37:31] (step=0815800) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 17:37:39] (step=0815900) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 17:37:47] (step=0816000) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 17:37:55] (step=0816100) Train Loss: 0.1459, Train Steps/Sec: 12.09, Grad Norm: 0.0063 +[2025-02-21 17:38:03] (step=0816200) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 17:38:11] (step=0816300) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 17:38:19] (step=0816400) Train Loss: 0.1454, Train Steps/Sec: 12.19, Grad Norm: 0.0059 +[2025-02-21 17:38:27] (step=0816500) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 17:38:36] (step=0816600) Train Loss: 0.1456, Train Steps/Sec: 11.75, Grad Norm: 0.0060 +[2025-02-21 17:38:45] (step=0816700) Train Loss: 0.1457, Train Steps/Sec: 11.35, Grad Norm: 0.0058 +[2025-02-21 17:38:53] (step=0816800) Train Loss: 0.1461, Train Steps/Sec: 11.79, Grad Norm: 0.0061 +[2025-02-21 17:39:01] (step=0816900) Train Loss: 0.1448, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 17:39:10] (step=0817000) Train Loss: 0.1453, Train Steps/Sec: 11.20, Grad Norm: 0.0062 +[2025-02-21 17:39:18] (step=0817100) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 17:39:26] (step=0817200) Train Loss: 0.1450, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 17:39:34] (step=0817300) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 17:39:42] (step=0817400) Train Loss: 0.1446, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 17:39:52] (step=0817500) Train Loss: 0.1457, Train Steps/Sec: 9.95, Grad Norm: 0.0059 +[2025-02-21 17:40:01] (step=0817600) Train Loss: 0.1457, Train Steps/Sec: 11.01, Grad Norm: 0.0060 +[2025-02-21 17:40:11] (step=0817700) Train Loss: 0.1456, Train Steps/Sec: 10.00, Grad Norm: 0.0062 +[2025-02-21 17:40:20] (step=0817800) Train Loss: 0.1452, Train Steps/Sec: 11.34, Grad Norm: 0.0061 +[2025-02-21 17:40:28] (step=0817900) Train Loss: 0.1472, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 17:40:36] (step=0818000) Train Loss: 0.1475, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 17:40:44] (step=0818100) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 17:40:53] (step=0818200) Train Loss: 0.1457, Train Steps/Sec: 11.22, Grad Norm: 0.0062 +[2025-02-21 17:41:01] (step=0818300) Train Loss: 0.1476, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 17:41:09] (step=0818400) Train Loss: 0.1465, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 17:41:17] (step=0818500) Train Loss: 0.1459, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 17:41:25] (step=0818600) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 17:41:33] (step=0818700) Train Loss: 0.1461, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 17:41:42] (step=0818800) Train Loss: 0.1451, Train Steps/Sec: 10.67, Grad Norm: 0.0058 +[2025-02-21 17:41:50] (step=0818900) Train Loss: 0.1445, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 17:41:59] (step=0819000) Train Loss: 0.1441, Train Steps/Sec: 11.78, Grad Norm: 0.0059 +[2025-02-21 17:42:07] (step=0819100) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 17:42:15] (step=0819200) Train Loss: 0.1468, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 17:42:23] (step=0819300) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 17:42:31] (step=0819400) Train Loss: 0.1457, Train Steps/Sec: 12.53, Grad Norm: 0.0060 +[2025-02-21 17:42:40] (step=0819500) Train Loss: 0.1465, Train Steps/Sec: 11.26, Grad Norm: 0.0060 +[2025-02-21 17:42:48] (step=0819600) Train Loss: 0.1459, Train Steps/Sec: 12.19, Grad Norm: 0.0058 +[2025-02-21 17:42:58] (step=0819700) Train Loss: 0.1459, Train Steps/Sec: 10.24, Grad Norm: 0.0063 +[2025-02-21 17:43:07] (step=0819800) Train Loss: 0.1456, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-21 17:43:17] (step=0819900) Train Loss: 0.1455, Train Steps/Sec: 9.99, Grad Norm: 0.0060 +[2025-02-21 17:43:26] (step=0820000) Train Loss: 0.1459, Train Steps/Sec: 11.00, Grad Norm: 0.0059 +[2025-02-21 17:43:30] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0820000.pt +[2025-02-21 17:43:38] (step=0820100) Train Loss: 0.1468, Train Steps/Sec: 8.25, Grad Norm: 0.0062 +[2025-02-21 17:43:46] (step=0820200) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 17:43:54] (step=0820300) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 17:44:02] (step=0820400) Train Loss: 0.1457, Train Steps/Sec: 12.10, Grad Norm: 0.0061 +[2025-02-21 17:44:10] (step=0820500) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 17:44:18] (step=0820600) Train Loss: 0.1451, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 17:44:27] (step=0820700) Train Loss: 0.1453, Train Steps/Sec: 10.90, Grad Norm: 0.0058 +[2025-02-21 17:44:35] (step=0820800) Train Loss: 0.1449, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 17:44:44] (step=0820900) Train Loss: 0.1463, Train Steps/Sec: 11.75, Grad Norm: 0.0058 +[2025-02-21 17:44:52] (step=0821000) Train Loss: 0.1455, Train Steps/Sec: 11.35, Grad Norm: 0.0062 +[2025-02-21 17:45:01] (step=0821100) Train Loss: 0.1459, Train Steps/Sec: 11.81, Grad Norm: 0.0065 +[2025-02-21 17:45:09] (step=0821200) Train Loss: 0.1475, Train Steps/Sec: 12.56, Grad Norm: 0.0058 +[2025-02-21 17:45:17] (step=0821300) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 17:45:25] (step=0821400) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 17:45:33] (step=0821500) Train Loss: 0.1482, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 17:45:41] (step=0821600) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 17:45:48] (step=0821700) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 17:45:57] (step=0821800) Train Loss: 0.1464, Train Steps/Sec: 12.17, Grad Norm: 0.0062 +[2025-02-21 17:46:06] (step=0821900) Train Loss: 0.1468, Train Steps/Sec: 10.19, Grad Norm: 0.0063 +[2025-02-21 17:46:17] (step=0822000) Train Loss: 0.1459, Train Steps/Sec: 9.60, Grad Norm: 0.0060 +[2025-02-21 17:46:27] (step=0822100) Train Loss: 0.1456, Train Steps/Sec: 10.26, Grad Norm: 0.0057 +[2025-02-21 17:46:35] (step=0822200) Train Loss: 0.1476, Train Steps/Sec: 11.31, Grad Norm: 0.0060 +[2025-02-21 17:46:43] (step=0822300) Train Loss: 0.1439, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 17:46:51] (step=0822400) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 17:46:59] (step=0822500) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 17:47:07] (step=0822600) Train Loss: 0.1463, Train Steps/Sec: 12.17, Grad Norm: 0.0063 +[2025-02-21 17:47:15] (step=0822700) Train Loss: 0.1455, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 17:47:24] (step=0822800) Train Loss: 0.1455, Train Steps/Sec: 12.20, Grad Norm: 0.0065 +[2025-02-21 17:47:32] (step=0822900) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 17:47:40] (step=0823000) Train Loss: 0.1464, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 17:47:49] (step=0823100) Train Loss: 0.1451, Train Steps/Sec: 11.04, Grad Norm: 0.0061 +[2025-02-21 17:47:58] (step=0823200) Train Loss: 0.1464, Train Steps/Sec: 10.57, Grad Norm: 0.0061 +[2025-02-21 17:48:06] (step=0823300) Train Loss: 0.1467, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 17:48:14] (step=0823400) Train Loss: 0.1473, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 17:48:22] (step=0823500) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 17:48:30] (step=0823600) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 17:48:38] (step=0823700) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 17:48:46] (step=0823800) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 17:48:54] (step=0823900) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0056 +[2025-02-21 17:49:02] (step=0824000) Train Loss: 0.1461, Train Steps/Sec: 11.73, Grad Norm: 0.0059 +[2025-02-21 17:49:12] (step=0824100) Train Loss: 0.1457, Train Steps/Sec: 10.94, Grad Norm: 0.0059 +[2025-02-21 17:49:21] (step=0824200) Train Loss: 0.1455, Train Steps/Sec: 10.26, Grad Norm: 0.0057 +[2025-02-21 17:49:30] (step=0824300) Train Loss: 0.1458, Train Steps/Sec: 10.97, Grad Norm: 0.0060 +[2025-02-21 17:49:40] (step=0824400) Train Loss: 0.1463, Train Steps/Sec: 10.96, Grad Norm: 0.0059 +[2025-02-21 17:49:48] (step=0824500) Train Loss: 0.1452, Train Steps/Sec: 11.29, Grad Norm: 0.0060 +[2025-02-21 17:49:56] (step=0824600) Train Loss: 0.1447, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 17:50:04] (step=0824700) Train Loss: 0.1457, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 17:50:12] (step=0824800) Train Loss: 0.1472, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 17:50:20] (step=0824900) Train Loss: 0.1470, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 17:50:29] (step=0825000) Train Loss: 0.1452, Train Steps/Sec: 12.22, Grad Norm: 0.0060 +[2025-02-21 17:50:36] (step=0825100) Train Loss: 0.1451, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 17:50:46] (step=0825200) Train Loss: 0.1474, Train Steps/Sec: 11.04, Grad Norm: 0.0063 +[2025-02-21 17:50:54] (step=0825300) Train Loss: 0.1461, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 17:51:02] (step=0825400) Train Loss: 0.1465, Train Steps/Sec: 11.78, Grad Norm: 0.0060 +[2025-02-21 17:51:10] (step=0825500) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 17:51:18] (step=0825600) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0056 +[2025-02-21 17:51:27] (step=0825700) Train Loss: 0.1456, Train Steps/Sec: 11.25, Grad Norm: 0.0062 +[2025-02-21 17:51:35] (step=0825800) Train Loss: 0.1471, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 17:51:43] (step=0825900) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 17:51:51] (step=0826000) Train Loss: 0.1460, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 17:51:59] (step=0826100) Train Loss: 0.1457, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 17:52:08] (step=0826200) Train Loss: 0.1461, Train Steps/Sec: 11.36, Grad Norm: 0.0059 +[2025-02-21 17:52:17] (step=0826300) Train Loss: 0.1450, Train Steps/Sec: 10.56, Grad Norm: 0.0060 +[2025-02-21 17:52:27] (step=0826400) Train Loss: 0.1472, Train Steps/Sec: 9.97, Grad Norm: 0.0059 +[2025-02-21 17:52:36] (step=0826500) Train Loss: 0.1460, Train Steps/Sec: 10.99, Grad Norm: 0.0059 +[2025-02-21 17:52:45] (step=0826600) Train Loss: 0.1463, Train Steps/Sec: 11.32, Grad Norm: 0.0062 +[2025-02-21 17:52:53] (step=0826700) Train Loss: 0.1457, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 17:53:01] (step=0826800) Train Loss: 0.1453, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 17:53:09] (step=0826900) Train Loss: 0.1453, Train Steps/Sec: 12.53, Grad Norm: 0.0061 +[2025-02-21 17:53:18] (step=0827000) Train Loss: 0.1450, Train Steps/Sec: 10.88, Grad Norm: 0.0059 +[2025-02-21 17:53:26] (step=0827100) Train Loss: 0.1467, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 17:53:34] (step=0827200) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 17:53:43] (step=0827300) Train Loss: 0.1460, Train Steps/Sec: 11.36, Grad Norm: 0.0059 +[2025-02-21 17:53:52] (step=0827400) Train Loss: 0.1462, Train Steps/Sec: 11.75, Grad Norm: 0.0062 +[2025-02-21 17:54:00] (step=0827500) Train Loss: 0.1454, Train Steps/Sec: 11.80, Grad Norm: 0.0059 +[2025-02-21 17:54:08] (step=0827600) Train Loss: 0.1450, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 17:54:16] (step=0827700) Train Loss: 0.1451, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 17:54:24] (step=0827800) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 17:54:32] (step=0827900) Train Loss: 0.1460, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 17:54:40] (step=0828000) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 17:54:48] (step=0828100) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 17:54:56] (step=0828200) Train Loss: 0.1456, Train Steps/Sec: 11.25, Grad Norm: 0.0059 +[2025-02-21 17:55:04] (step=0828300) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 17:55:14] (step=0828400) Train Loss: 0.1456, Train Steps/Sec: 10.61, Grad Norm: 0.0057 +[2025-02-21 17:55:23] (step=0828500) Train Loss: 0.1468, Train Steps/Sec: 10.92, Grad Norm: 0.0063 +[2025-02-21 17:55:33] (step=0828600) Train Loss: 0.1455, Train Steps/Sec: 9.99, Grad Norm: 0.0060 +[2025-02-21 17:55:42] (step=0828700) Train Loss: 0.1447, Train Steps/Sec: 10.99, Grad Norm: 0.0064 +[2025-02-21 17:55:51] (step=0828800) Train Loss: 0.1463, Train Steps/Sec: 11.74, Grad Norm: 0.0060 +[2025-02-21 17:55:59] (step=0828900) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 17:56:06] (step=0829000) Train Loss: 0.1478, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 17:56:14] (step=0829100) Train Loss: 0.1454, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 17:56:23] (step=0829200) Train Loss: 0.1464, Train Steps/Sec: 11.77, Grad Norm: 0.0058 +[2025-02-21 17:56:31] (step=0829300) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 17:56:39] (step=0829400) Train Loss: 0.1464, Train Steps/Sec: 12.54, Grad Norm: 0.0063 +[2025-02-21 17:56:49] (step=0829500) Train Loss: 0.1452, Train Steps/Sec: 9.67, Grad Norm: 0.0058 +[2025-02-21 17:56:57] (step=0829600) Train Loss: 0.1453, Train Steps/Sec: 12.19, Grad Norm: 0.0057 +[2025-02-21 17:57:06] (step=0829700) Train Loss: 0.1469, Train Steps/Sec: 12.20, Grad Norm: 0.0059 +[2025-02-21 17:57:13] (step=0829800) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 17:57:21] (step=0829900) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 17:57:29] (step=0830000) Train Loss: 0.1456, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 17:57:37] (step=0830100) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 17:57:45] (step=0830200) Train Loss: 0.1451, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 17:57:53] (step=0830300) Train Loss: 0.1440, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 17:58:01] (step=0830400) Train Loss: 0.1456, Train Steps/Sec: 12.55, Grad Norm: 0.0058 +[2025-02-21 17:58:09] (step=0830500) Train Loss: 0.1449, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 17:58:19] (step=0830600) Train Loss: 0.1450, Train Steps/Sec: 10.52, Grad Norm: 0.0061 +[2025-02-21 17:58:29] (step=0830700) Train Loss: 0.1463, Train Steps/Sec: 10.14, Grad Norm: 0.0062 +[2025-02-21 17:58:39] (step=0830800) Train Loss: 0.1469, Train Steps/Sec: 9.66, Grad Norm: 0.0061 +[2025-02-21 17:58:48] (step=0830900) Train Loss: 0.1455, Train Steps/Sec: 11.37, Grad Norm: 0.0061 +[2025-02-21 17:58:56] (step=0831000) Train Loss: 0.1454, Train Steps/Sec: 11.69, Grad Norm: 0.0062 +[2025-02-21 17:59:04] (step=0831100) Train Loss: 0.1457, Train Steps/Sec: 12.54, Grad Norm: 0.0060 +[2025-02-21 17:59:12] (step=0831200) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 17:59:20] (step=0831300) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 17:59:29] (step=0831400) Train Loss: 0.1452, Train Steps/Sec: 11.78, Grad Norm: 0.0060 +[2025-02-21 17:59:36] (step=0831500) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 17:59:45] (step=0831600) Train Loss: 0.1447, Train Steps/Sec: 11.35, Grad Norm: 0.0065 +[2025-02-21 17:59:54] (step=0831700) Train Loss: 0.1461, Train Steps/Sec: 11.77, Grad Norm: 0.0061 +[2025-02-21 18:00:02] (step=0831800) Train Loss: 0.1456, Train Steps/Sec: 11.80, Grad Norm: 0.0058 +[2025-02-21 18:00:10] (step=0831900) Train Loss: 0.1460, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 18:00:19] (step=0832000) Train Loss: 0.1465, Train Steps/Sec: 11.26, Grad Norm: 0.0063 +[2025-02-21 18:00:27] (step=0832100) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 18:00:35] (step=0832200) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 18:00:43] (step=0832300) Train Loss: 0.1456, Train Steps/Sec: 12.63, Grad Norm: 0.0057 +[2025-02-21 18:00:51] (step=0832400) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 18:00:59] (step=0832500) Train Loss: 0.1461, Train Steps/Sec: 12.65, Grad Norm: 0.0062 +[2025-02-21 18:01:07] (step=0832600) Train Loss: 0.1470, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 18:01:15] (step=0832700) Train Loss: 0.1454, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 18:01:24] (step=0832800) Train Loss: 0.1439, Train Steps/Sec: 10.64, Grad Norm: 0.0059 +[2025-02-21 18:01:33] (step=0832900) Train Loss: 0.1462, Train Steps/Sec: 10.90, Grad Norm: 0.0063 +[2025-02-21 18:01:43] (step=0833000) Train Loss: 0.1454, Train Steps/Sec: 9.97, Grad Norm: 0.0065 +[2025-02-21 18:01:52] (step=0833100) Train Loss: 0.1444, Train Steps/Sec: 11.38, Grad Norm: 0.0059 +[2025-02-21 18:02:01] (step=0833200) Train Loss: 0.1441, Train Steps/Sec: 10.52, Grad Norm: 0.0059 +[2025-02-21 18:02:09] (step=0833300) Train Loss: 0.1453, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 18:02:17] (step=0833400) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 18:02:26] (step=0833500) Train Loss: 0.1459, Train Steps/Sec: 12.23, Grad Norm: 0.0060 +[2025-02-21 18:02:34] (step=0833600) Train Loss: 0.1443, Train Steps/Sec: 12.20, Grad Norm: 0.0062 +[2025-02-21 18:02:42] (step=0833700) Train Loss: 0.1452, Train Steps/Sec: 11.77, Grad Norm: 0.0059 +[2025-02-21 18:02:51] (step=0833800) Train Loss: 0.1447, Train Steps/Sec: 11.42, Grad Norm: 0.0058 +[2025-02-21 18:02:59] (step=0833900) Train Loss: 0.1451, Train Steps/Sec: 11.85, Grad Norm: 0.0062 +[2025-02-21 18:03:07] (step=0834000) Train Loss: 0.1444, Train Steps/Sec: 12.65, Grad Norm: 0.0059 +[2025-02-21 18:03:15] (step=0834100) Train Loss: 0.1470, Train Steps/Sec: 12.65, Grad Norm: 0.0057 +[2025-02-21 18:03:23] (step=0834200) Train Loss: 0.1449, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 18:03:31] (step=0834300) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 18:03:39] (step=0834400) Train Loss: 0.1464, Train Steps/Sec: 12.55, Grad Norm: 0.0061 +[2025-02-21 18:03:48] (step=0834500) Train Loss: 0.1462, Train Steps/Sec: 11.34, Grad Norm: 0.0063 +[2025-02-21 18:03:56] (step=0834600) Train Loss: 0.1465, Train Steps/Sec: 12.58, Grad Norm: 0.0057 +[2025-02-21 18:04:04] (step=0834700) Train Loss: 0.1450, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 18:04:12] (step=0834800) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 18:04:20] (step=0834900) Train Loss: 0.1463, Train Steps/Sec: 12.16, Grad Norm: 0.0058 +[2025-02-21 18:04:29] (step=0835000) Train Loss: 0.1450, Train Steps/Sec: 10.58, Grad Norm: 0.0059 +[2025-02-21 18:04:39] (step=0835100) Train Loss: 0.1462, Train Steps/Sec: 10.56, Grad Norm: 0.0059 +[2025-02-21 18:04:49] (step=0835200) Train Loss: 0.1451, Train Steps/Sec: 10.00, Grad Norm: 0.0059 +[2025-02-21 18:04:57] (step=0835300) Train Loss: 0.1459, Train Steps/Sec: 11.75, Grad Norm: 0.0059 +[2025-02-21 18:05:06] (step=0835400) Train Loss: 0.1461, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-21 18:05:14] (step=0835500) Train Loss: 0.1442, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 18:05:22] (step=0835600) Train Loss: 0.1473, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 18:05:31] (step=0835700) Train Loss: 0.1450, Train Steps/Sec: 10.91, Grad Norm: 0.0057 +[2025-02-21 18:05:39] (step=0835800) Train Loss: 0.1464, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 18:05:49] (step=0835900) Train Loss: 0.1462, Train Steps/Sec: 10.69, Grad Norm: 0.0060 +[2025-02-21 18:05:57] (step=0836000) Train Loss: 0.1455, Train Steps/Sec: 12.20, Grad Norm: 0.0063 +[2025-02-21 18:06:05] (step=0836100) Train Loss: 0.1467, Train Steps/Sec: 12.21, Grad Norm: 0.0062 +[2025-02-21 18:06:13] (step=0836200) Train Loss: 0.1449, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 18:06:21] (step=0836300) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 18:06:29] (step=0836400) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 18:06:37] (step=0836500) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 18:06:45] (step=0836600) Train Loss: 0.1462, Train Steps/Sec: 12.65, Grad Norm: 0.0060 +[2025-02-21 18:06:52] (step=0836700) Train Loss: 0.1453, Train Steps/Sec: 12.66, Grad Norm: 0.0060 +[2025-02-21 18:07:00] (step=0836800) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 18:07:08] (step=0836900) Train Loss: 0.1454, Train Steps/Sec: 12.51, Grad Norm: 0.0060 +[2025-02-21 18:07:17] (step=0837000) Train Loss: 0.1460, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-21 18:07:26] (step=0837100) Train Loss: 0.1466, Train Steps/Sec: 11.68, Grad Norm: 0.0063 +[2025-02-21 18:07:35] (step=0837200) Train Loss: 0.1466, Train Steps/Sec: 10.93, Grad Norm: 0.0058 +[2025-02-21 18:07:45] (step=0837300) Train Loss: 0.1467, Train Steps/Sec: 10.28, Grad Norm: 0.0060 +[2025-02-21 18:07:55] (step=0837400) Train Loss: 0.1450, Train Steps/Sec: 9.69, Grad Norm: 0.0060 +[2025-02-21 18:08:03] (step=0837500) Train Loss: 0.1460, Train Steps/Sec: 12.16, Grad Norm: 0.0058 +[2025-02-21 18:08:12] (step=0837600) Train Loss: 0.1463, Train Steps/Sec: 11.76, Grad Norm: 0.0060 +[2025-02-21 18:08:20] (step=0837700) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 18:08:28] (step=0837800) Train Loss: 0.1452, Train Steps/Sec: 12.21, Grad Norm: 0.0057 +[2025-02-21 18:08:36] (step=0837900) Train Loss: 0.1439, Train Steps/Sec: 12.63, Grad Norm: 0.0057 +[2025-02-21 18:08:45] (step=0838000) Train Loss: 0.1460, Train Steps/Sec: 11.01, Grad Norm: 0.0059 +[2025-02-21 18:08:53] (step=0838100) Train Loss: 0.1458, Train Steps/Sec: 11.76, Grad Norm: 0.0059 +[2025-02-21 18:09:03] (step=0838200) Train Loss: 0.1451, Train Steps/Sec: 10.54, Grad Norm: 0.0062 +[2025-02-21 18:09:11] (step=0838300) Train Loss: 0.1447, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 18:09:19] (step=0838400) Train Loss: 0.1460, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 18:09:27] (step=0838500) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 18:09:35] (step=0838600) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 18:09:42] (step=0838700) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 18:09:50] (step=0838800) Train Loss: 0.1474, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 18:09:58] (step=0838900) Train Loss: 0.1446, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 18:10:06] (step=0839000) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 18:10:14] (step=0839100) Train Loss: 0.1454, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 18:10:22] (step=0839200) Train Loss: 0.1457, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 18:10:31] (step=0839300) Train Loss: 0.1454, Train Steps/Sec: 11.72, Grad Norm: 0.0061 +[2025-02-21 18:10:40] (step=0839400) Train Loss: 0.1447, Train Steps/Sec: 10.91, Grad Norm: 0.0060 +[2025-02-21 18:10:51] (step=0839500) Train Loss: 0.1462, Train Steps/Sec: 9.11, Grad Norm: 0.0058 +[2025-02-21 18:11:01] (step=0839600) Train Loss: 0.1453, Train Steps/Sec: 10.00, Grad Norm: 0.0058 +[2025-02-21 18:11:09] (step=0839700) Train Loss: 0.1457, Train Steps/Sec: 12.17, Grad Norm: 0.0063 +[2025-02-21 18:11:18] (step=0839800) Train Loss: 0.1453, Train Steps/Sec: 11.75, Grad Norm: 0.0061 +[2025-02-21 18:11:26] (step=0839900) Train Loss: 0.1461, Train Steps/Sec: 12.22, Grad Norm: 0.0062 +[2025-02-21 18:11:34] (step=0840000) Train Loss: 0.1464, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 18:11:37] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0840000.pt +[2025-02-21 18:11:46] (step=0840100) Train Loss: 0.1459, Train Steps/Sec: 7.80, Grad Norm: 0.0057 +[2025-02-21 18:11:55] (step=0840200) Train Loss: 0.1466, Train Steps/Sec: 11.79, Grad Norm: 0.0060 +[2025-02-21 18:12:03] (step=0840300) Train Loss: 0.1455, Train Steps/Sec: 11.79, Grad Norm: 0.0060 +[2025-02-21 18:12:11] (step=0840400) Train Loss: 0.1452, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 18:12:19] (step=0840500) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 18:12:27] (step=0840600) Train Loss: 0.1475, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 18:12:36] (step=0840700) Train Loss: 0.1455, Train Steps/Sec: 11.19, Grad Norm: 0.0061 +[2025-02-21 18:12:44] (step=0840800) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 18:12:52] (step=0840900) Train Loss: 0.1454, Train Steps/Sec: 12.58, Grad Norm: 0.0067 +[2025-02-21 18:13:00] (step=0841000) Train Loss: 0.1463, Train Steps/Sec: 12.57, Grad Norm: 0.0064 +[2025-02-21 18:13:08] (step=0841100) Train Loss: 0.1452, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 18:13:16] (step=0841200) Train Loss: 0.1473, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 18:13:24] (step=0841300) Train Loss: 0.1458, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 18:13:32] (step=0841400) Train Loss: 0.1468, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 18:13:41] (step=0841500) Train Loss: 0.1466, Train Steps/Sec: 11.34, Grad Norm: 0.0062 +[2025-02-21 18:13:50] (step=0841600) Train Loss: 0.1455, Train Steps/Sec: 10.56, Grad Norm: 0.0058 +[2025-02-21 18:14:00] (step=0841700) Train Loss: 0.1453, Train Steps/Sec: 10.24, Grad Norm: 0.0064 +[2025-02-21 18:14:10] (step=0841800) Train Loss: 0.1453, Train Steps/Sec: 10.23, Grad Norm: 0.0060 +[2025-02-21 18:14:18] (step=0841900) Train Loss: 0.1465, Train Steps/Sec: 12.04, Grad Norm: 0.0060 +[2025-02-21 18:14:28] (step=0842000) Train Loss: 0.1460, Train Steps/Sec: 10.21, Grad Norm: 0.0060 +[2025-02-21 18:14:36] (step=0842100) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 18:14:44] (step=0842200) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 18:14:53] (step=0842300) Train Loss: 0.1456, Train Steps/Sec: 10.35, Grad Norm: 0.0062 +[2025-02-21 18:15:01] (step=0842400) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 18:15:10] (step=0842500) Train Loss: 0.1464, Train Steps/Sec: 11.76, Grad Norm: 0.0061 +[2025-02-21 18:15:18] (step=0842600) Train Loss: 0.1447, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 18:15:26] (step=0842700) Train Loss: 0.1458, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 18:15:34] (step=0842800) Train Loss: 0.1451, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 18:15:41] (step=0842900) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 18:15:49] (step=0843000) Train Loss: 0.1469, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 18:15:57] (step=0843100) Train Loss: 0.1449, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 18:16:06] (step=0843200) Train Loss: 0.1475, Train Steps/Sec: 11.17, Grad Norm: 0.0062 +[2025-02-21 18:16:14] (step=0843300) Train Loss: 0.1462, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 18:16:22] (step=0843400) Train Loss: 0.1455, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 18:16:30] (step=0843500) Train Loss: 0.1466, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 18:16:38] (step=0843600) Train Loss: 0.1463, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 18:16:47] (step=0843700) Train Loss: 0.1453, Train Steps/Sec: 11.30, Grad Norm: 0.0061 +[2025-02-21 18:16:56] (step=0843800) Train Loss: 0.1457, Train Steps/Sec: 10.90, Grad Norm: 0.0058 +[2025-02-21 18:17:06] (step=0843900) Train Loss: 0.1454, Train Steps/Sec: 10.27, Grad Norm: 0.0059 +[2025-02-21 18:17:16] (step=0844000) Train Loss: 0.1448, Train Steps/Sec: 10.53, Grad Norm: 0.0058 +[2025-02-21 18:17:24] (step=0844100) Train Loss: 0.1461, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 18:17:33] (step=0844200) Train Loss: 0.1462, Train Steps/Sec: 11.69, Grad Norm: 0.0065 +[2025-02-21 18:17:41] (step=0844300) Train Loss: 0.1458, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 18:17:50] (step=0844400) Train Loss: 0.1453, Train Steps/Sec: 11.30, Grad Norm: 0.0059 +[2025-02-21 18:17:59] (step=0844500) Train Loss: 0.1467, Train Steps/Sec: 10.18, Grad Norm: 0.0057 +[2025-02-21 18:18:08] (step=0844600) Train Loss: 0.1449, Train Steps/Sec: 11.75, Grad Norm: 0.0057 +[2025-02-21 18:18:16] (step=0844700) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 18:18:24] (step=0844800) Train Loss: 0.1452, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 18:18:32] (step=0844900) Train Loss: 0.1454, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 18:18:40] (step=0845000) Train Loss: 0.1458, Train Steps/Sec: 12.54, Grad Norm: 0.0062 +[2025-02-21 18:18:48] (step=0845100) Train Loss: 0.1453, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 18:18:56] (step=0845200) Train Loss: 0.1464, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 18:19:04] (step=0845300) Train Loss: 0.1454, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 18:19:12] (step=0845400) Train Loss: 0.1457, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 18:19:20] (step=0845500) Train Loss: 0.1466, Train Steps/Sec: 12.51, Grad Norm: 0.0059 +[2025-02-21 18:19:28] (step=0845600) Train Loss: 0.1469, Train Steps/Sec: 12.50, Grad Norm: 0.0060 +[2025-02-21 18:19:36] (step=0845700) Train Loss: 0.1450, Train Steps/Sec: 11.25, Grad Norm: 0.0059 +[2025-02-21 18:19:45] (step=0845800) Train Loss: 0.1460, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 18:19:54] (step=0845900) Train Loss: 0.1465, Train Steps/Sec: 11.27, Grad Norm: 0.0061 +[2025-02-21 18:20:03] (step=0846000) Train Loss: 0.1467, Train Steps/Sec: 10.23, Grad Norm: 0.0061 +[2025-02-21 18:20:12] (step=0846100) Train Loss: 0.1457, Train Steps/Sec: 10.96, Grad Norm: 0.0061 +[2025-02-21 18:20:22] (step=0846200) Train Loss: 0.1458, Train Steps/Sec: 10.58, Grad Norm: 0.0062 +[2025-02-21 18:20:31] (step=0846300) Train Loss: 0.1463, Train Steps/Sec: 11.36, Grad Norm: 0.0061 +[2025-02-21 18:20:39] (step=0846400) Train Loss: 0.1485, Train Steps/Sec: 12.13, Grad Norm: 0.0058 +[2025-02-21 18:20:47] (step=0846500) Train Loss: 0.1461, Train Steps/Sec: 12.16, Grad Norm: 0.0063 +[2025-02-21 18:20:56] (step=0846600) Train Loss: 0.1456, Train Steps/Sec: 11.07, Grad Norm: 0.0064 +[2025-02-21 18:21:05] (step=0846700) Train Loss: 0.1469, Train Steps/Sec: 11.78, Grad Norm: 0.0059 +[2025-02-21 18:21:13] (step=0846800) Train Loss: 0.1468, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 18:21:21] (step=0846900) Train Loss: 0.1466, Train Steps/Sec: 12.56, Grad Norm: 0.0066 +[2025-02-21 18:21:30] (step=0847000) Train Loss: 0.1460, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-21 18:21:38] (step=0847100) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 18:21:46] (step=0847200) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 18:21:54] (step=0847300) Train Loss: 0.1472, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 18:22:02] (step=0847400) Train Loss: 0.1462, Train Steps/Sec: 12.58, Grad Norm: 0.0056 +[2025-02-21 18:22:09] (step=0847500) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 18:22:17] (step=0847600) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 18:22:25] (step=0847700) Train Loss: 0.1452, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 18:22:33] (step=0847800) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 18:22:41] (step=0847900) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 18:22:49] (step=0848000) Train Loss: 0.1463, Train Steps/Sec: 12.20, Grad Norm: 0.0059 +[2025-02-21 18:22:59] (step=0848100) Train Loss: 0.1450, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-21 18:23:09] (step=0848200) Train Loss: 0.1457, Train Steps/Sec: 9.64, Grad Norm: 0.0061 +[2025-02-21 18:23:19] (step=0848300) Train Loss: 0.1459, Train Steps/Sec: 10.28, Grad Norm: 0.0059 +[2025-02-21 18:23:28] (step=0848400) Train Loss: 0.1458, Train Steps/Sec: 10.65, Grad Norm: 0.0060 +[2025-02-21 18:23:37] (step=0848500) Train Loss: 0.1455, Train Steps/Sec: 11.74, Grad Norm: 0.0058 +[2025-02-21 18:23:45] (step=0848600) Train Loss: 0.1457, Train Steps/Sec: 12.06, Grad Norm: 0.0061 +[2025-02-21 18:23:54] (step=0848700) Train Loss: 0.1451, Train Steps/Sec: 11.02, Grad Norm: 0.0062 +[2025-02-21 18:24:02] (step=0848800) Train Loss: 0.1450, Train Steps/Sec: 12.22, Grad Norm: 0.0062 +[2025-02-21 18:24:11] (step=0848900) Train Loss: 0.1468, Train Steps/Sec: 11.40, Grad Norm: 0.0061 +[2025-02-21 18:24:19] (step=0849000) Train Loss: 0.1448, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 18:24:27] (step=0849100) Train Loss: 0.1454, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 18:24:35] (step=0849200) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 18:24:43] (step=0849300) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 18:24:51] (step=0849400) Train Loss: 0.1450, Train Steps/Sec: 12.53, Grad Norm: 0.0057 +[2025-02-21 18:24:59] (step=0849500) Train Loss: 0.1466, Train Steps/Sec: 11.28, Grad Norm: 0.0062 +[2025-02-21 18:25:07] (step=0849600) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 18:25:15] (step=0849700) Train Loss: 0.1469, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 18:25:23] (step=0849800) Train Loss: 0.1449, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 18:25:31] (step=0849900) Train Loss: 0.1458, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 18:25:39] (step=0850000) Train Loss: 0.1446, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 18:25:47] (step=0850100) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 18:25:55] (step=0850200) Train Loss: 0.1442, Train Steps/Sec: 12.20, Grad Norm: 0.0062 +[2025-02-21 18:26:05] (step=0850300) Train Loss: 0.1455, Train Steps/Sec: 10.63, Grad Norm: 0.0061 +[2025-02-21 18:26:14] (step=0850400) Train Loss: 0.1464, Train Steps/Sec: 10.94, Grad Norm: 0.0061 +[2025-02-21 18:26:24] (step=0850500) Train Loss: 0.1455, Train Steps/Sec: 9.98, Grad Norm: 0.0062 +[2025-02-21 18:26:33] (step=0850600) Train Loss: 0.1452, Train Steps/Sec: 11.02, Grad Norm: 0.0057 +[2025-02-21 18:26:43] (step=0850700) Train Loss: 0.1463, Train Steps/Sec: 10.22, Grad Norm: 0.0061 +[2025-02-21 18:26:51] (step=0850800) Train Loss: 0.1457, Train Steps/Sec: 11.34, Grad Norm: 0.0059 +[2025-02-21 18:27:00] (step=0850900) Train Loss: 0.1460, Train Steps/Sec: 11.77, Grad Norm: 0.0058 +[2025-02-21 18:27:08] (step=0851000) Train Loss: 0.1461, Train Steps/Sec: 11.78, Grad Norm: 0.0064 +[2025-02-21 18:27:17] (step=0851100) Train Loss: 0.1465, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 18:27:25] (step=0851200) Train Loss: 0.1471, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 18:27:32] (step=0851300) Train Loss: 0.1444, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 18:27:40] (step=0851400) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 18:27:48] (step=0851500) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0068 +[2025-02-21 18:27:56] (step=0851600) Train Loss: 0.1450, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 18:28:04] (step=0851700) Train Loss: 0.1456, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 18:28:12] (step=0851800) Train Loss: 0.1451, Train Steps/Sec: 12.65, Grad Norm: 0.0058 +[2025-02-21 18:28:20] (step=0851900) Train Loss: 0.1458, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 18:28:29] (step=0852000) Train Loss: 0.1460, Train Steps/Sec: 11.25, Grad Norm: 0.0061 +[2025-02-21 18:28:37] (step=0852100) Train Loss: 0.1468, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 18:28:45] (step=0852200) Train Loss: 0.1445, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 18:28:53] (step=0852300) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 18:29:01] (step=0852400) Train Loss: 0.1458, Train Steps/Sec: 12.20, Grad Norm: 0.0059 +[2025-02-21 18:29:10] (step=0852500) Train Loss: 0.1455, Train Steps/Sec: 10.63, Grad Norm: 0.0059 +[2025-02-21 18:29:21] (step=0852600) Train Loss: 0.1460, Train Steps/Sec: 9.69, Grad Norm: 0.0060 +[2025-02-21 18:29:30] (step=0852700) Train Loss: 0.1455, Train Steps/Sec: 10.98, Grad Norm: 0.0059 +[2025-02-21 18:29:39] (step=0852800) Train Loss: 0.1448, Train Steps/Sec: 11.37, Grad Norm: 0.0065 +[2025-02-21 18:29:47] (step=0852900) Train Loss: 0.1450, Train Steps/Sec: 11.36, Grad Norm: 0.0060 +[2025-02-21 18:29:56] (step=0853000) Train Loss: 0.1464, Train Steps/Sec: 11.02, Grad Norm: 0.0061 +[2025-02-21 18:30:05] (step=0853100) Train Loss: 0.1457, Train Steps/Sec: 11.45, Grad Norm: 0.0060 +[2025-02-21 18:30:14] (step=0853200) Train Loss: 0.1457, Train Steps/Sec: 11.25, Grad Norm: 0.0062 +[2025-02-21 18:30:22] (step=0853300) Train Loss: 0.1457, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 18:30:30] (step=0853400) Train Loss: 0.1480, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 18:30:38] (step=0853500) Train Loss: 0.1439, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 18:30:46] (step=0853600) Train Loss: 0.1451, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 18:30:54] (step=0853700) Train Loss: 0.1469, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 18:31:02] (step=0853800) Train Loss: 0.1475, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 18:31:10] (step=0853900) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 18:31:18] (step=0854000) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 18:31:26] (step=0854100) Train Loss: 0.1441, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 18:31:34] (step=0854200) Train Loss: 0.1453, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 18:31:42] (step=0854300) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 18:31:49] (step=0854400) Train Loss: 0.1451, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 18:31:58] (step=0854500) Train Loss: 0.1445, Train Steps/Sec: 11.21, Grad Norm: 0.0060 +[2025-02-21 18:32:07] (step=0854600) Train Loss: 0.1446, Train Steps/Sec: 11.75, Grad Norm: 0.0058 +[2025-02-21 18:32:16] (step=0854700) Train Loss: 0.1453, Train Steps/Sec: 10.97, Grad Norm: 0.0062 +[2025-02-21 18:32:26] (step=0854800) Train Loss: 0.1468, Train Steps/Sec: 9.96, Grad Norm: 0.0061 +[2025-02-21 18:32:35] (step=0854900) Train Loss: 0.1477, Train Steps/Sec: 10.64, Grad Norm: 0.0059 +[2025-02-21 18:32:44] (step=0855000) Train Loss: 0.1470, Train Steps/Sec: 11.78, Grad Norm: 0.0063 +[2025-02-21 18:32:53] (step=0855100) Train Loss: 0.1464, Train Steps/Sec: 10.64, Grad Norm: 0.0059 +[2025-02-21 18:33:02] (step=0855200) Train Loss: 0.1451, Train Steps/Sec: 12.23, Grad Norm: 0.0059 +[2025-02-21 18:33:10] (step=0855300) Train Loss: 0.1445, Train Steps/Sec: 11.81, Grad Norm: 0.0059 +[2025-02-21 18:33:18] (step=0855400) Train Loss: 0.1471, Train Steps/Sec: 12.64, Grad Norm: 0.0064 +[2025-02-21 18:33:26] (step=0855500) Train Loss: 0.1448, Train Steps/Sec: 12.18, Grad Norm: 0.0058 +[2025-02-21 18:33:34] (step=0855600) Train Loss: 0.1456, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 18:33:43] (step=0855700) Train Loss: 0.1460, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-21 18:33:51] (step=0855800) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 18:33:59] (step=0855900) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 18:34:07] (step=0856000) Train Loss: 0.1446, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 18:34:15] (step=0856100) Train Loss: 0.1451, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 18:34:23] (step=0856200) Train Loss: 0.1448, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 18:34:31] (step=0856300) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 18:34:39] (step=0856400) Train Loss: 0.1446, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 18:34:46] (step=0856500) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 18:34:54] (step=0856600) Train Loss: 0.1470, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 18:35:02] (step=0856700) Train Loss: 0.1454, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 18:35:11] (step=0856800) Train Loss: 0.1448, Train Steps/Sec: 11.73, Grad Norm: 0.0059 +[2025-02-21 18:35:20] (step=0856900) Train Loss: 0.1461, Train Steps/Sec: 10.96, Grad Norm: 0.0061 +[2025-02-21 18:35:32] (step=0857000) Train Loss: 0.1450, Train Steps/Sec: 8.66, Grad Norm: 0.0061 +[2025-02-21 18:35:40] (step=0857100) Train Loss: 0.1451, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-21 18:35:50] (step=0857200) Train Loss: 0.1464, Train Steps/Sec: 10.64, Grad Norm: 0.0058 +[2025-02-21 18:35:59] (step=0857300) Train Loss: 0.1473, Train Steps/Sec: 10.99, Grad Norm: 0.0057 +[2025-02-21 18:36:07] (step=0857400) Train Loss: 0.1471, Train Steps/Sec: 11.77, Grad Norm: 0.0060 +[2025-02-21 18:36:15] (step=0857500) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 18:36:23] (step=0857600) Train Loss: 0.1441, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 18:36:31] (step=0857700) Train Loss: 0.1461, Train Steps/Sec: 12.20, Grad Norm: 0.0060 +[2025-02-21 18:36:39] (step=0857800) Train Loss: 0.1465, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 18:36:47] (step=0857900) Train Loss: 0.1461, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 18:36:55] (step=0858000) Train Loss: 0.1465, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 18:37:03] (step=0858100) Train Loss: 0.1461, Train Steps/Sec: 12.65, Grad Norm: 0.0062 +[2025-02-21 18:37:12] (step=0858200) Train Loss: 0.1461, Train Steps/Sec: 11.21, Grad Norm: 0.0059 +[2025-02-21 18:37:20] (step=0858300) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 18:37:28] (step=0858400) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 18:37:36] (step=0858500) Train Loss: 0.1457, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 18:37:44] (step=0858600) Train Loss: 0.1457, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 18:37:52] (step=0858700) Train Loss: 0.1457, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 18:38:00] (step=0858800) Train Loss: 0.1466, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 18:38:08] (step=0858900) Train Loss: 0.1458, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 18:38:16] (step=0859000) Train Loss: 0.1451, Train Steps/Sec: 11.33, Grad Norm: 0.0061 +[2025-02-21 18:38:26] (step=0859100) Train Loss: 0.1464, Train Steps/Sec: 10.27, Grad Norm: 0.0059 +[2025-02-21 18:38:36] (step=0859200) Train Loss: 0.1450, Train Steps/Sec: 9.89, Grad Norm: 0.0060 +[2025-02-21 18:38:45] (step=0859300) Train Loss: 0.1459, Train Steps/Sec: 11.27, Grad Norm: 0.0063 +[2025-02-21 18:38:55] (step=0859400) Train Loss: 0.1453, Train Steps/Sec: 9.95, Grad Norm: 0.0060 +[2025-02-21 18:39:05] (step=0859500) Train Loss: 0.1459, Train Steps/Sec: 9.89, Grad Norm: 0.0064 +[2025-02-21 18:39:14] (step=0859600) Train Loss: 0.1475, Train Steps/Sec: 12.20, Grad Norm: 0.0059 +[2025-02-21 18:39:21] (step=0859700) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 18:39:29] (step=0859800) Train Loss: 0.1454, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 18:39:38] (step=0859900) Train Loss: 0.1455, Train Steps/Sec: 12.21, Grad Norm: 0.0060 +[2025-02-21 18:39:46] (step=0860000) Train Loss: 0.1478, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 18:39:51] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0860000.pt +[2025-02-21 18:39:59] (step=0860100) Train Loss: 0.1463, Train Steps/Sec: 7.47, Grad Norm: 0.0057 +[2025-02-21 18:40:07] (step=0860200) Train Loss: 0.1466, Train Steps/Sec: 12.62, Grad Norm: 0.0067 +[2025-02-21 18:40:15] (step=0860300) Train Loss: 0.1454, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 18:40:23] (step=0860400) Train Loss: 0.1476, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 18:40:31] (step=0860500) Train Loss: 0.1466, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 18:40:39] (step=0860600) Train Loss: 0.1452, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 18:40:47] (step=0860700) Train Loss: 0.1456, Train Steps/Sec: 11.21, Grad Norm: 0.0059 +[2025-02-21 18:40:55] (step=0860800) Train Loss: 0.1455, Train Steps/Sec: 12.57, Grad Norm: 0.0064 +[2025-02-21 18:41:03] (step=0860900) Train Loss: 0.1449, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 18:41:11] (step=0861000) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 18:41:19] (step=0861100) Train Loss: 0.1461, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 18:41:28] (step=0861200) Train Loss: 0.1465, Train Steps/Sec: 11.36, Grad Norm: 0.0063 +[2025-02-21 18:41:38] (step=0861300) Train Loss: 0.1452, Train Steps/Sec: 10.31, Grad Norm: 0.0059 +[2025-02-21 18:41:48] (step=0861400) Train Loss: 0.1462, Train Steps/Sec: 9.73, Grad Norm: 0.0058 +[2025-02-21 18:41:57] (step=0861500) Train Loss: 0.1466, Train Steps/Sec: 10.68, Grad Norm: 0.0061 +[2025-02-21 18:42:07] (step=0861600) Train Loss: 0.1452, Train Steps/Sec: 10.69, Grad Norm: 0.0062 +[2025-02-21 18:42:16] (step=0861700) Train Loss: 0.1463, Train Steps/Sec: 11.01, Grad Norm: 0.0060 +[2025-02-21 18:42:24] (step=0861800) Train Loss: 0.1446, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 18:42:32] (step=0861900) Train Loss: 0.1447, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 18:42:41] (step=0862000) Train Loss: 0.1472, Train Steps/Sec: 10.92, Grad Norm: 0.0063 +[2025-02-21 18:42:49] (step=0862100) Train Loss: 0.1466, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 18:42:57] (step=0862200) Train Loss: 0.1465, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 18:43:05] (step=0862300) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 18:43:12] (step=0862400) Train Loss: 0.1454, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 18:43:20] (step=0862500) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 18:43:28] (step=0862600) Train Loss: 0.1454, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 18:43:36] (step=0862700) Train Loss: 0.1465, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 18:43:44] (step=0862800) Train Loss: 0.1444, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 18:43:52] (step=0862900) Train Loss: 0.1450, Train Steps/Sec: 12.65, Grad Norm: 0.0064 +[2025-02-21 18:44:00] (step=0863000) Train Loss: 0.1472, Train Steps/Sec: 12.66, Grad Norm: 0.0061 +[2025-02-21 18:44:08] (step=0863100) Train Loss: 0.1463, Train Steps/Sec: 12.65, Grad Norm: 0.0065 +[2025-02-21 18:44:17] (step=0863200) Train Loss: 0.1454, Train Steps/Sec: 11.26, Grad Norm: 0.0060 +[2025-02-21 18:44:25] (step=0863300) Train Loss: 0.1459, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 18:44:34] (step=0863400) Train Loss: 0.1467, Train Steps/Sec: 10.58, Grad Norm: 0.0059 +[2025-02-21 18:44:44] (step=0863500) Train Loss: 0.1465, Train Steps/Sec: 10.91, Grad Norm: 0.0061 +[2025-02-21 18:44:54] (step=0863600) Train Loss: 0.1448, Train Steps/Sec: 9.44, Grad Norm: 0.0058 +[2025-02-21 18:45:04] (step=0863700) Train Loss: 0.1457, Train Steps/Sec: 10.67, Grad Norm: 0.0062 +[2025-02-21 18:45:13] (step=0863800) Train Loss: 0.1467, Train Steps/Sec: 10.67, Grad Norm: 0.0063 +[2025-02-21 18:45:21] (step=0863900) Train Loss: 0.1454, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 18:45:29] (step=0864000) Train Loss: 0.1451, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 18:45:37] (step=0864100) Train Loss: 0.1445, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 18:45:46] (step=0864200) Train Loss: 0.1466, Train Steps/Sec: 12.19, Grad Norm: 0.0068 +[2025-02-21 18:45:53] (step=0864300) Train Loss: 0.1461, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 18:46:01] (step=0864400) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 18:46:10] (step=0864500) Train Loss: 0.1463, Train Steps/Sec: 11.26, Grad Norm: 0.0062 +[2025-02-21 18:46:18] (step=0864600) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 18:46:26] (step=0864700) Train Loss: 0.1449, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 18:46:34] (step=0864800) Train Loss: 0.1458, Train Steps/Sec: 12.65, Grad Norm: 0.0061 +[2025-02-21 18:46:42] (step=0864900) Train Loss: 0.1457, Train Steps/Sec: 12.65, Grad Norm: 0.0065 +[2025-02-21 18:46:50] (step=0865000) Train Loss: 0.1453, Train Steps/Sec: 12.63, Grad Norm: 0.0065 +[2025-02-21 18:46:58] (step=0865100) Train Loss: 0.1456, Train Steps/Sec: 12.65, Grad Norm: 0.0058 +[2025-02-21 18:47:06] (step=0865200) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 18:47:14] (step=0865300) Train Loss: 0.1476, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 18:47:21] (step=0865400) Train Loss: 0.1463, Train Steps/Sec: 12.65, Grad Norm: 0.0061 +[2025-02-21 18:47:30] (step=0865500) Train Loss: 0.1455, Train Steps/Sec: 11.81, Grad Norm: 0.0061 +[2025-02-21 18:47:39] (step=0865600) Train Loss: 0.1453, Train Steps/Sec: 10.61, Grad Norm: 0.0059 +[2025-02-21 18:47:49] (step=0865700) Train Loss: 0.1461, Train Steps/Sec: 10.19, Grad Norm: 0.0061 +[2025-02-21 18:48:00] (step=0865800) Train Loss: 0.1456, Train Steps/Sec: 9.13, Grad Norm: 0.0058 +[2025-02-21 18:48:09] (step=0865900) Train Loss: 0.1471, Train Steps/Sec: 10.97, Grad Norm: 0.0059 +[2025-02-21 18:48:18] (step=0866000) Train Loss: 0.1467, Train Steps/Sec: 11.34, Grad Norm: 0.0063 +[2025-02-21 18:48:27] (step=0866100) Train Loss: 0.1457, Train Steps/Sec: 11.70, Grad Norm: 0.0058 +[2025-02-21 18:48:35] (step=0866200) Train Loss: 0.1451, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 18:48:43] (step=0866300) Train Loss: 0.1451, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 18:48:51] (step=0866400) Train Loss: 0.1462, Train Steps/Sec: 12.09, Grad Norm: 0.0060 +[2025-02-21 18:48:59] (step=0866500) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 18:49:07] (step=0866600) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 18:49:15] (step=0866700) Train Loss: 0.1445, Train Steps/Sec: 12.55, Grad Norm: 0.0062 +[2025-02-21 18:49:23] (step=0866800) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 18:49:30] (step=0866900) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 18:49:39] (step=0867000) Train Loss: 0.1459, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-21 18:49:47] (step=0867100) Train Loss: 0.1464, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 18:49:55] (step=0867200) Train Loss: 0.1461, Train Steps/Sec: 12.54, Grad Norm: 0.0058 +[2025-02-21 18:50:03] (step=0867300) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 18:50:11] (step=0867400) Train Loss: 0.1458, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 18:50:19] (step=0867500) Train Loss: 0.1468, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 18:50:27] (step=0867600) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 18:50:36] (step=0867700) Train Loss: 0.1440, Train Steps/Sec: 11.40, Grad Norm: 0.0062 +[2025-02-21 18:50:45] (step=0867800) Train Loss: 0.1465, Train Steps/Sec: 10.63, Grad Norm: 0.0063 +[2025-02-21 18:50:55] (step=0867900) Train Loss: 0.1466, Train Steps/Sec: 10.56, Grad Norm: 0.0059 +[2025-02-21 18:51:05] (step=0868000) Train Loss: 0.1457, Train Steps/Sec: 9.44, Grad Norm: 0.0059 +[2025-02-21 18:51:14] (step=0868100) Train Loss: 0.1470, Train Steps/Sec: 10.99, Grad Norm: 0.0062 +[2025-02-21 18:51:24] (step=0868200) Train Loss: 0.1464, Train Steps/Sec: 10.90, Grad Norm: 0.0059 +[2025-02-21 18:51:32] (step=0868300) Train Loss: 0.1481, Train Steps/Sec: 11.69, Grad Norm: 0.0059 +[2025-02-21 18:51:40] (step=0868400) Train Loss: 0.1476, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 18:51:48] (step=0868500) Train Loss: 0.1438, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 18:51:56] (step=0868600) Train Loss: 0.1452, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 18:52:04] (step=0868700) Train Loss: 0.1459, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 18:52:12] (step=0868800) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 18:52:20] (step=0868900) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 18:52:28] (step=0869000) Train Loss: 0.1448, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 18:52:36] (step=0869100) Train Loss: 0.1463, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 18:52:44] (step=0869200) Train Loss: 0.1456, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 18:52:52] (step=0869300) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 18:53:00] (step=0869400) Train Loss: 0.1469, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 18:53:08] (step=0869500) Train Loss: 0.1454, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-21 18:53:16] (step=0869600) Train Loss: 0.1458, Train Steps/Sec: 12.65, Grad Norm: 0.0058 +[2025-02-21 18:53:24] (step=0869700) Train Loss: 0.1459, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 18:53:32] (step=0869800) Train Loss: 0.1452, Train Steps/Sec: 12.20, Grad Norm: 0.0058 +[2025-02-21 18:53:41] (step=0869900) Train Loss: 0.1462, Train Steps/Sec: 11.40, Grad Norm: 0.0059 +[2025-02-21 18:53:51] (step=0870000) Train Loss: 0.1458, Train Steps/Sec: 10.33, Grad Norm: 0.0060 +[2025-02-21 18:54:01] (step=0870100) Train Loss: 0.1454, Train Steps/Sec: 9.70, Grad Norm: 0.0060 +[2025-02-21 18:54:11] (step=0870200) Train Loss: 0.1446, Train Steps/Sec: 10.66, Grad Norm: 0.0058 +[2025-02-21 18:54:19] (step=0870300) Train Loss: 0.1459, Train Steps/Sec: 11.35, Grad Norm: 0.0061 +[2025-02-21 18:54:28] (step=0870400) Train Loss: 0.1449, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 18:54:36] (step=0870500) Train Loss: 0.1446, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 18:54:44] (step=0870600) Train Loss: 0.1442, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 18:54:53] (step=0870700) Train Loss: 0.1468, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-21 18:55:01] (step=0870800) Train Loss: 0.1443, Train Steps/Sec: 12.15, Grad Norm: 0.0063 +[2025-02-21 18:55:09] (step=0870900) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 18:55:17] (step=0871000) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 18:55:25] (step=0871100) Train Loss: 0.1451, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 18:55:33] (step=0871200) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 18:55:41] (step=0871300) Train Loss: 0.1471, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 18:55:49] (step=0871400) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 18:55:57] (step=0871500) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 18:56:05] (step=0871600) Train Loss: 0.1470, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 18:56:13] (step=0871700) Train Loss: 0.1443, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 18:56:20] (step=0871800) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 18:56:29] (step=0871900) Train Loss: 0.1451, Train Steps/Sec: 12.17, Grad Norm: 0.0058 +[2025-02-21 18:56:38] (step=0872000) Train Loss: 0.1469, Train Steps/Sec: 11.24, Grad Norm: 0.0058 +[2025-02-21 18:56:47] (step=0872100) Train Loss: 0.1459, Train Steps/Sec: 10.94, Grad Norm: 0.0059 +[2025-02-21 18:56:57] (step=0872200) Train Loss: 0.1461, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-21 18:57:07] (step=0872300) Train Loss: 0.1433, Train Steps/Sec: 9.72, Grad Norm: 0.0060 +[2025-02-21 18:57:16] (step=0872400) Train Loss: 0.1452, Train Steps/Sec: 10.99, Grad Norm: 0.0059 +[2025-02-21 18:57:25] (step=0872500) Train Loss: 0.1461, Train Steps/Sec: 11.34, Grad Norm: 0.0059 +[2025-02-21 18:57:33] (step=0872600) Train Loss: 0.1453, Train Steps/Sec: 12.17, Grad Norm: 0.0059 +[2025-02-21 18:57:42] (step=0872700) Train Loss: 0.1456, Train Steps/Sec: 11.77, Grad Norm: 0.0058 +[2025-02-21 18:57:50] (step=0872800) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 18:57:58] (step=0872900) Train Loss: 0.1464, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 18:58:06] (step=0873000) Train Loss: 0.1455, Train Steps/Sec: 12.20, Grad Norm: 0.0058 +[2025-02-21 18:58:14] (step=0873100) Train Loss: 0.1455, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 18:58:23] (step=0873200) Train Loss: 0.1446, Train Steps/Sec: 11.28, Grad Norm: 0.0063 +[2025-02-21 18:58:30] (step=0873300) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 18:58:38] (step=0873400) Train Loss: 0.1449, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 18:58:46] (step=0873500) Train Loss: 0.1446, Train Steps/Sec: 12.62, Grad Norm: 0.0069 +[2025-02-21 18:58:54] (step=0873600) Train Loss: 0.1463, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 18:59:02] (step=0873700) Train Loss: 0.1450, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 18:59:10] (step=0873800) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 18:59:18] (step=0873900) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 18:59:26] (step=0874000) Train Loss: 0.1448, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 18:59:34] (step=0874100) Train Loss: 0.1477, Train Steps/Sec: 12.21, Grad Norm: 0.0059 +[2025-02-21 18:59:42] (step=0874200) Train Loss: 0.1447, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 18:59:52] (step=0874300) Train Loss: 0.1455, Train Steps/Sec: 10.31, Grad Norm: 0.0063 +[2025-02-21 19:00:02] (step=0874400) Train Loss: 0.1461, Train Steps/Sec: 10.04, Grad Norm: 0.0063 +[2025-02-21 19:00:13] (step=0874500) Train Loss: 0.1465, Train Steps/Sec: 9.06, Grad Norm: 0.0062 +[2025-02-21 19:00:22] (step=0874600) Train Loss: 0.1458, Train Steps/Sec: 10.95, Grad Norm: 0.0062 +[2025-02-21 19:00:30] (step=0874700) Train Loss: 0.1452, Train Steps/Sec: 11.74, Grad Norm: 0.0058 +[2025-02-21 19:00:39] (step=0874800) Train Loss: 0.1456, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 19:00:47] (step=0874900) Train Loss: 0.1445, Train Steps/Sec: 11.74, Grad Norm: 0.0061 +[2025-02-21 19:00:55] (step=0875000) Train Loss: 0.1434, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 19:01:03] (step=0875100) Train Loss: 0.1471, Train Steps/Sec: 12.17, Grad Norm: 0.0062 +[2025-02-21 19:01:11] (step=0875200) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 19:01:19] (step=0875300) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 19:01:27] (step=0875400) Train Loss: 0.1447, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 19:01:35] (step=0875500) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 19:01:43] (step=0875600) Train Loss: 0.1451, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 19:01:51] (step=0875700) Train Loss: 0.1454, Train Steps/Sec: 12.02, Grad Norm: 0.0060 +[2025-02-21 19:02:00] (step=0875800) Train Loss: 0.1450, Train Steps/Sec: 11.75, Grad Norm: 0.0062 +[2025-02-21 19:02:08] (step=0875900) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 19:02:16] (step=0876000) Train Loss: 0.1450, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 19:02:24] (step=0876100) Train Loss: 0.1463, Train Steps/Sec: 12.56, Grad Norm: 0.0064 +[2025-02-21 19:02:32] (step=0876200) Train Loss: 0.1457, Train Steps/Sec: 12.18, Grad Norm: 0.0063 +[2025-02-21 19:02:40] (step=0876300) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 19:02:49] (step=0876400) Train Loss: 0.1456, Train Steps/Sec: 11.39, Grad Norm: 0.0062 +[2025-02-21 19:02:58] (step=0876500) Train Loss: 0.1458, Train Steps/Sec: 10.32, Grad Norm: 0.0062 +[2025-02-21 19:03:07] (step=0876600) Train Loss: 0.1460, Train Steps/Sec: 10.98, Grad Norm: 0.0059 +[2025-02-21 19:03:17] (step=0876700) Train Loss: 0.1464, Train Steps/Sec: 10.19, Grad Norm: 0.0059 +[2025-02-21 19:03:26] (step=0876800) Train Loss: 0.1470, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-21 19:03:35] (step=0876900) Train Loss: 0.1441, Train Steps/Sec: 11.70, Grad Norm: 0.0062 +[2025-02-21 19:03:45] (step=0877000) Train Loss: 0.1451, Train Steps/Sec: 10.25, Grad Norm: 0.0058 +[2025-02-21 19:03:53] (step=0877100) Train Loss: 0.1459, Train Steps/Sec: 12.55, Grad Norm: 0.0058 +[2025-02-21 19:04:01] (step=0877200) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 19:04:09] (step=0877300) Train Loss: 0.1448, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 19:04:17] (step=0877400) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 19:04:25] (step=0877500) Train Loss: 0.1455, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 19:04:33] (step=0877600) Train Loss: 0.1460, Train Steps/Sec: 12.65, Grad Norm: 0.0060 +[2025-02-21 19:04:40] (step=0877700) Train Loss: 0.1465, Train Steps/Sec: 12.65, Grad Norm: 0.0061 +[2025-02-21 19:04:48] (step=0877800) Train Loss: 0.1468, Train Steps/Sec: 12.66, Grad Norm: 0.0059 +[2025-02-21 19:04:56] (step=0877900) Train Loss: 0.1456, Train Steps/Sec: 12.65, Grad Norm: 0.0060 +[2025-02-21 19:05:04] (step=0878000) Train Loss: 0.1470, Train Steps/Sec: 12.65, Grad Norm: 0.0061 +[2025-02-21 19:05:12] (step=0878100) Train Loss: 0.1457, Train Steps/Sec: 12.66, Grad Norm: 0.0057 +[2025-02-21 19:05:20] (step=0878200) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 19:05:29] (step=0878300) Train Loss: 0.1458, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-21 19:05:37] (step=0878400) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 19:05:45] (step=0878500) Train Loss: 0.1462, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 19:05:54] (step=0878600) Train Loss: 0.1462, Train Steps/Sec: 11.39, Grad Norm: 0.0060 +[2025-02-21 19:06:04] (step=0878700) Train Loss: 0.1458, Train Steps/Sec: 9.98, Grad Norm: 0.0063 +[2025-02-21 19:06:14] (step=0878800) Train Loss: 0.1454, Train Steps/Sec: 10.35, Grad Norm: 0.0061 +[2025-02-21 19:06:23] (step=0878900) Train Loss: 0.1458, Train Steps/Sec: 10.64, Grad Norm: 0.0061 +[2025-02-21 19:06:32] (step=0879000) Train Loss: 0.1449, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 19:06:41] (step=0879100) Train Loss: 0.1461, Train Steps/Sec: 11.76, Grad Norm: 0.0059 +[2025-02-21 19:06:49] (step=0879200) Train Loss: 0.1461, Train Steps/Sec: 11.30, Grad Norm: 0.0063 +[2025-02-21 19:06:57] (step=0879300) Train Loss: 0.1466, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 19:07:05] (step=0879400) Train Loss: 0.1448, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 19:07:14] (step=0879500) Train Loss: 0.1464, Train Steps/Sec: 10.83, Grad Norm: 0.0063 +[2025-02-21 19:07:22] (step=0879600) Train Loss: 0.1443, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 19:07:30] (step=0879700) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 19:07:38] (step=0879800) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 19:07:46] (step=0879900) Train Loss: 0.1452, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 19:07:54] (step=0880000) Train Loss: 0.1461, Train Steps/Sec: 12.63, Grad Norm: 0.0064 +[2025-02-21 19:07:57] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0880000.pt +[2025-02-21 19:08:05] (step=0880100) Train Loss: 0.1459, Train Steps/Sec: 9.02, Grad Norm: 0.0058 +[2025-02-21 19:08:13] (step=0880200) Train Loss: 0.1454, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 19:08:21] (step=0880300) Train Loss: 0.1468, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 19:08:29] (step=0880400) Train Loss: 0.1453, Train Steps/Sec: 12.63, Grad Norm: 0.0057 +[2025-02-21 19:08:37] (step=0880500) Train Loss: 0.1462, Train Steps/Sec: 12.20, Grad Norm: 0.0061 +[2025-02-21 19:08:45] (step=0880600) Train Loss: 0.1445, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 19:08:54] (step=0880700) Train Loss: 0.1435, Train Steps/Sec: 11.75, Grad Norm: 0.0058 +[2025-02-21 19:09:04] (step=0880800) Train Loss: 0.1456, Train Steps/Sec: 10.01, Grad Norm: 0.0061 +[2025-02-21 19:09:13] (step=0880900) Train Loss: 0.1458, Train Steps/Sec: 10.59, Grad Norm: 0.0057 +[2025-02-21 19:09:22] (step=0881000) Train Loss: 0.1454, Train Steps/Sec: 10.60, Grad Norm: 0.0059 +[2025-02-21 19:09:32] (step=0881100) Train Loss: 0.1469, Train Steps/Sec: 10.60, Grad Norm: 0.0060 +[2025-02-21 19:09:41] (step=0881200) Train Loss: 0.1450, Train Steps/Sec: 11.35, Grad Norm: 0.0058 +[2025-02-21 19:09:49] (step=0881300) Train Loss: 0.1463, Train Steps/Sec: 12.14, Grad Norm: 0.0065 +[2025-02-21 19:09:58] (step=0881400) Train Loss: 0.1457, Train Steps/Sec: 11.38, Grad Norm: 0.0062 +[2025-02-21 19:10:06] (step=0881500) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 19:10:14] (step=0881600) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 19:10:22] (step=0881700) Train Loss: 0.1461, Train Steps/Sec: 12.19, Grad Norm: 0.0059 +[2025-02-21 19:10:30] (step=0881800) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 19:10:38] (step=0881900) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 19:10:47] (step=0882000) Train Loss: 0.1451, Train Steps/Sec: 11.23, Grad Norm: 0.0057 +[2025-02-21 19:10:54] (step=0882100) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 19:11:02] (step=0882200) Train Loss: 0.1450, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 19:11:10] (step=0882300) Train Loss: 0.1447, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 19:11:18] (step=0882400) Train Loss: 0.1454, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 19:11:26] (step=0882500) Train Loss: 0.1471, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 19:11:34] (step=0882600) Train Loss: 0.1457, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 19:11:42] (step=0882700) Train Loss: 0.1465, Train Steps/Sec: 12.66, Grad Norm: 0.0060 +[2025-02-21 19:11:51] (step=0882800) Train Loss: 0.1451, Train Steps/Sec: 11.82, Grad Norm: 0.0059 +[2025-02-21 19:11:59] (step=0882900) Train Loss: 0.1458, Train Steps/Sec: 11.82, Grad Norm: 0.0060 +[2025-02-21 19:12:08] (step=0883000) Train Loss: 0.1446, Train Steps/Sec: 11.03, Grad Norm: 0.0060 +[2025-02-21 19:12:18] (step=0883100) Train Loss: 0.1456, Train Steps/Sec: 10.30, Grad Norm: 0.0059 +[2025-02-21 19:12:27] (step=0883200) Train Loss: 0.1441, Train Steps/Sec: 11.28, Grad Norm: 0.0058 +[2025-02-21 19:12:37] (step=0883300) Train Loss: 0.1452, Train Steps/Sec: 9.64, Grad Norm: 0.0060 +[2025-02-21 19:12:46] (step=0883400) Train Loss: 0.1466, Train Steps/Sec: 11.31, Grad Norm: 0.0060 +[2025-02-21 19:12:54] (step=0883500) Train Loss: 0.1454, Train Steps/Sec: 12.12, Grad Norm: 0.0063 +[2025-02-21 19:13:03] (step=0883600) Train Loss: 0.1466, Train Steps/Sec: 11.33, Grad Norm: 0.0060 +[2025-02-21 19:13:11] (step=0883700) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 19:13:19] (step=0883800) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 19:13:27] (step=0883900) Train Loss: 0.1462, Train Steps/Sec: 12.17, Grad Norm: 0.0057 +[2025-02-21 19:13:35] (step=0884000) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 19:13:43] (step=0884100) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 19:13:51] (step=0884200) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 19:13:59] (step=0884300) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 19:14:07] (step=0884400) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 19:14:16] (step=0884500) Train Loss: 0.1455, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-21 19:14:24] (step=0884600) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 19:14:32] (step=0884700) Train Loss: 0.1444, Train Steps/Sec: 12.21, Grad Norm: 0.0059 +[2025-02-21 19:14:40] (step=0884800) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 19:14:48] (step=0884900) Train Loss: 0.1462, Train Steps/Sec: 12.19, Grad Norm: 0.0062 +[2025-02-21 19:14:57] (step=0885000) Train Loss: 0.1456, Train Steps/Sec: 11.82, Grad Norm: 0.0057 +[2025-02-21 19:15:05] (step=0885100) Train Loss: 0.1451, Train Steps/Sec: 11.78, Grad Norm: 0.0059 +[2025-02-21 19:15:14] (step=0885200) Train Loss: 0.1455, Train Steps/Sec: 11.00, Grad Norm: 0.0061 +[2025-02-21 19:15:23] (step=0885300) Train Loss: 0.1459, Train Steps/Sec: 11.00, Grad Norm: 0.0061 +[2025-02-21 19:15:32] (step=0885400) Train Loss: 0.1454, Train Steps/Sec: 10.95, Grad Norm: 0.0059 +[2025-02-21 19:15:41] (step=0885500) Train Loss: 0.1455, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-21 19:15:50] (step=0885600) Train Loss: 0.1464, Train Steps/Sec: 11.26, Grad Norm: 0.0060 +[2025-02-21 19:15:59] (step=0885700) Train Loss: 0.1470, Train Steps/Sec: 11.99, Grad Norm: 0.0060 +[2025-02-21 19:16:08] (step=0885800) Train Loss: 0.1452, Train Steps/Sec: 10.22, Grad Norm: 0.0060 +[2025-02-21 19:16:16] (step=0885900) Train Loss: 0.1454, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 19:16:24] (step=0886000) Train Loss: 0.1445, Train Steps/Sec: 12.58, Grad Norm: 0.0057 +[2025-02-21 19:16:33] (step=0886100) Train Loss: 0.1457, Train Steps/Sec: 12.12, Grad Norm: 0.0058 +[2025-02-21 19:16:41] (step=0886200) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 19:16:49] (step=0886300) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 19:16:56] (step=0886400) Train Loss: 0.1466, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 19:17:04] (step=0886500) Train Loss: 0.1458, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 19:17:12] (step=0886600) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 19:17:20] (step=0886700) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 19:17:28] (step=0886800) Train Loss: 0.1446, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 19:17:36] (step=0886900) Train Loss: 0.1439, Train Steps/Sec: 12.20, Grad Norm: 0.0058 +[2025-02-21 19:17:45] (step=0887000) Train Loss: 0.1456, Train Steps/Sec: 11.26, Grad Norm: 0.0060 +[2025-02-21 19:17:54] (step=0887100) Train Loss: 0.1449, Train Steps/Sec: 11.41, Grad Norm: 0.0060 +[2025-02-21 19:18:03] (step=0887200) Train Loss: 0.1473, Train Steps/Sec: 11.76, Grad Norm: 0.0063 +[2025-02-21 19:18:11] (step=0887300) Train Loss: 0.1460, Train Steps/Sec: 12.21, Grad Norm: 0.0062 +[2025-02-21 19:18:20] (step=0887400) Train Loss: 0.1447, Train Steps/Sec: 11.00, Grad Norm: 0.0058 +[2025-02-21 19:18:29] (step=0887500) Train Loss: 0.1457, Train Steps/Sec: 11.01, Grad Norm: 0.0062 +[2025-02-21 19:18:38] (step=0887600) Train Loss: 0.1452, Train Steps/Sec: 10.61, Grad Norm: 0.0063 +[2025-02-21 19:18:48] (step=0887700) Train Loss: 0.1451, Train Steps/Sec: 10.94, Grad Norm: 0.0061 +[2025-02-21 19:18:56] (step=0887800) Train Loss: 0.1460, Train Steps/Sec: 11.78, Grad Norm: 0.0061 +[2025-02-21 19:19:04] (step=0887900) Train Loss: 0.1449, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 19:19:13] (step=0888000) Train Loss: 0.1461, Train Steps/Sec: 11.31, Grad Norm: 0.0061 +[2025-02-21 19:19:21] (step=0888100) Train Loss: 0.1447, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 19:19:29] (step=0888200) Train Loss: 0.1451, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 19:19:38] (step=0888300) Train Loss: 0.1451, Train Steps/Sec: 10.88, Grad Norm: 0.0058 +[2025-02-21 19:19:46] (step=0888400) Train Loss: 0.1465, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 19:19:54] (step=0888500) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 19:20:02] (step=0888600) Train Loss: 0.1456, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 19:20:10] (step=0888700) Train Loss: 0.1467, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 19:20:18] (step=0888800) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 19:20:26] (step=0888900) Train Loss: 0.1440, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 19:20:34] (step=0889000) Train Loss: 0.1455, Train Steps/Sec: 12.21, Grad Norm: 0.0061 +[2025-02-21 19:20:42] (step=0889100) Train Loss: 0.1451, Train Steps/Sec: 12.55, Grad Norm: 0.0058 +[2025-02-21 19:20:50] (step=0889200) Train Loss: 0.1460, Train Steps/Sec: 11.76, Grad Norm: 0.0062 +[2025-02-21 19:20:59] (step=0889300) Train Loss: 0.1462, Train Steps/Sec: 11.83, Grad Norm: 0.0061 +[2025-02-21 19:21:07] (step=0889400) Train Loss: 0.1456, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 19:21:16] (step=0889500) Train Loss: 0.1445, Train Steps/Sec: 10.60, Grad Norm: 0.0059 +[2025-02-21 19:21:26] (step=0889600) Train Loss: 0.1453, Train Steps/Sec: 10.61, Grad Norm: 0.0061 +[2025-02-21 19:21:36] (step=0889700) Train Loss: 0.1449, Train Steps/Sec: 10.28, Grad Norm: 0.0063 +[2025-02-21 19:21:44] (step=0889800) Train Loss: 0.1449, Train Steps/Sec: 11.35, Grad Norm: 0.0061 +[2025-02-21 19:21:53] (step=0889900) Train Loss: 0.1462, Train Steps/Sec: 11.31, Grad Norm: 0.0065 +[2025-02-21 19:22:02] (step=0890000) Train Loss: 0.1450, Train Steps/Sec: 11.76, Grad Norm: 0.0059 +[2025-02-21 19:22:10] (step=0890100) Train Loss: 0.1466, Train Steps/Sec: 11.70, Grad Norm: 0.0057 +[2025-02-21 19:22:19] (step=0890200) Train Loss: 0.1460, Train Steps/Sec: 11.75, Grad Norm: 0.0061 +[2025-02-21 19:22:27] (step=0890300) Train Loss: 0.1446, Train Steps/Sec: 12.62, Grad Norm: 0.0057 +[2025-02-21 19:22:35] (step=0890400) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 19:22:43] (step=0890500) Train Loss: 0.1458, Train Steps/Sec: 12.15, Grad Norm: 0.0063 +[2025-02-21 19:22:51] (step=0890600) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 19:22:59] (step=0890700) Train Loss: 0.1454, Train Steps/Sec: 12.52, Grad Norm: 0.0060 +[2025-02-21 19:23:08] (step=0890800) Train Loss: 0.1464, Train Steps/Sec: 11.23, Grad Norm: 0.0062 +[2025-02-21 19:23:16] (step=0890900) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 19:23:24] (step=0891000) Train Loss: 0.1469, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 19:23:32] (step=0891100) Train Loss: 0.1449, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 19:23:40] (step=0891200) Train Loss: 0.1467, Train Steps/Sec: 12.19, Grad Norm: 0.0058 +[2025-02-21 19:23:48] (step=0891300) Train Loss: 0.1450, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 19:23:57] (step=0891400) Train Loss: 0.1460, Train Steps/Sec: 11.09, Grad Norm: 0.0062 +[2025-02-21 19:24:05] (step=0891500) Train Loss: 0.1463, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 19:24:13] (step=0891600) Train Loss: 0.1464, Train Steps/Sec: 11.76, Grad Norm: 0.0063 +[2025-02-21 19:24:21] (step=0891700) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 19:24:31] (step=0891800) Train Loss: 0.1465, Train Steps/Sec: 10.55, Grad Norm: 0.0059 +[2025-02-21 19:24:41] (step=0891900) Train Loss: 0.1457, Train Steps/Sec: 10.28, Grad Norm: 0.0061 +[2025-02-21 19:24:51] (step=0892000) Train Loss: 0.1468, Train Steps/Sec: 9.86, Grad Norm: 0.0061 +[2025-02-21 19:24:59] (step=0892100) Train Loss: 0.1460, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-21 19:25:08] (step=0892200) Train Loss: 0.1459, Train Steps/Sec: 11.71, Grad Norm: 0.0058 +[2025-02-21 19:25:16] (step=0892300) Train Loss: 0.1465, Train Steps/Sec: 11.69, Grad Norm: 0.0060 +[2025-02-21 19:25:25] (step=0892400) Train Loss: 0.1453, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 19:25:33] (step=0892500) Train Loss: 0.1443, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 19:25:41] (step=0892600) Train Loss: 0.1462, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 19:25:49] (step=0892700) Train Loss: 0.1438, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 19:25:57] (step=0892800) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 19:26:05] (step=0892900) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 19:26:13] (step=0893000) Train Loss: 0.1476, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 19:26:21] (step=0893100) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 19:26:29] (step=0893200) Train Loss: 0.1453, Train Steps/Sec: 12.57, Grad Norm: 0.0057 +[2025-02-21 19:26:38] (step=0893300) Train Loss: 0.1457, Train Steps/Sec: 10.85, Grad Norm: 0.0060 +[2025-02-21 19:26:46] (step=0893400) Train Loss: 0.1450, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 19:26:54] (step=0893500) Train Loss: 0.1447, Train Steps/Sec: 11.75, Grad Norm: 0.0060 +[2025-02-21 19:27:03] (step=0893600) Train Loss: 0.1452, Train Steps/Sec: 11.38, Grad Norm: 0.0060 +[2025-02-21 19:27:11] (step=0893700) Train Loss: 0.1452, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 19:27:20] (step=0893800) Train Loss: 0.1456, Train Steps/Sec: 12.20, Grad Norm: 0.0065 +[2025-02-21 19:27:28] (step=0893900) Train Loss: 0.1459, Train Steps/Sec: 12.21, Grad Norm: 0.0060 +[2025-02-21 19:27:37] (step=0894000) Train Loss: 0.1458, Train Steps/Sec: 10.67, Grad Norm: 0.0059 +[2025-02-21 19:27:47] (step=0894100) Train Loss: 0.1452, Train Steps/Sec: 10.62, Grad Norm: 0.0064 +[2025-02-21 19:27:56] (step=0894200) Train Loss: 0.1462, Train Steps/Sec: 10.96, Grad Norm: 0.0060 +[2025-02-21 19:28:04] (step=0894300) Train Loss: 0.1452, Train Steps/Sec: 11.67, Grad Norm: 0.0062 +[2025-02-21 19:28:13] (step=0894400) Train Loss: 0.1436, Train Steps/Sec: 11.75, Grad Norm: 0.0062 +[2025-02-21 19:28:22] (step=0894500) Train Loss: 0.1465, Train Steps/Sec: 10.48, Grad Norm: 0.0060 +[2025-02-21 19:28:31] (step=0894600) Train Loss: 0.1452, Train Steps/Sec: 11.72, Grad Norm: 0.0063 +[2025-02-21 19:28:39] (step=0894700) Train Loss: 0.1458, Train Steps/Sec: 12.34, Grad Norm: 0.0060 +[2025-02-21 19:28:47] (step=0894800) Train Loss: 0.1454, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 19:28:55] (step=0894900) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 19:29:03] (step=0895000) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0055 +[2025-02-21 19:29:11] (step=0895100) Train Loss: 0.1447, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 19:29:19] (step=0895200) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 19:29:27] (step=0895300) Train Loss: 0.1453, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 19:29:35] (step=0895400) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 19:29:43] (step=0895500) Train Loss: 0.1455, Train Steps/Sec: 12.20, Grad Norm: 0.0058 +[2025-02-21 19:29:51] (step=0895600) Train Loss: 0.1465, Train Steps/Sec: 12.16, Grad Norm: 0.0060 +[2025-02-21 19:30:00] (step=0895700) Train Loss: 0.1460, Train Steps/Sec: 11.38, Grad Norm: 0.0059 +[2025-02-21 19:30:09] (step=0895800) Train Loss: 0.1450, Train Steps/Sec: 10.96, Grad Norm: 0.0063 +[2025-02-21 19:30:18] (step=0895900) Train Loss: 0.1467, Train Steps/Sec: 11.76, Grad Norm: 0.0058 +[2025-02-21 19:30:26] (step=0896000) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 19:30:34] (step=0896100) Train Loss: 0.1462, Train Steps/Sec: 11.76, Grad Norm: 0.0062 +[2025-02-21 19:30:43] (step=0896200) Train Loss: 0.1451, Train Steps/Sec: 10.64, Grad Norm: 0.0062 +[2025-02-21 19:30:53] (step=0896300) Train Loss: 0.1459, Train Steps/Sec: 10.94, Grad Norm: 0.0061 +[2025-02-21 19:31:02] (step=0896400) Train Loss: 0.1466, Train Steps/Sec: 10.99, Grad Norm: 0.0060 +[2025-02-21 19:31:10] (step=0896500) Train Loss: 0.1460, Train Steps/Sec: 11.59, Grad Norm: 0.0067 +[2025-02-21 19:31:19] (step=0896600) Train Loss: 0.1460, Train Steps/Sec: 11.73, Grad Norm: 0.0057 +[2025-02-21 19:31:27] (step=0896700) Train Loss: 0.1449, Train Steps/Sec: 11.68, Grad Norm: 0.0061 +[2025-02-21 19:31:36] (step=0896800) Train Loss: 0.1452, Train Steps/Sec: 11.68, Grad Norm: 0.0061 +[2025-02-21 19:31:44] (step=0896900) Train Loss: 0.1460, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 19:31:53] (step=0897000) Train Loss: 0.1455, Train Steps/Sec: 10.88, Grad Norm: 0.0062 +[2025-02-21 19:32:01] (step=0897100) Train Loss: 0.1452, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 19:32:09] (step=0897200) Train Loss: 0.1464, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 19:32:17] (step=0897300) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 19:32:25] (step=0897400) Train Loss: 0.1454, Train Steps/Sec: 12.53, Grad Norm: 0.0061 +[2025-02-21 19:32:33] (step=0897500) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 19:32:41] (step=0897600) Train Loss: 0.1449, Train Steps/Sec: 12.19, Grad Norm: 0.0061 +[2025-02-21 19:32:49] (step=0897700) Train Loss: 0.1452, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 19:32:58] (step=0897800) Train Loss: 0.1455, Train Steps/Sec: 11.06, Grad Norm: 0.0061 +[2025-02-21 19:33:06] (step=0897900) Train Loss: 0.1447, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 19:33:15] (step=0898000) Train Loss: 0.1460, Train Steps/Sec: 11.76, Grad Norm: 0.0061 +[2025-02-21 19:33:23] (step=0898100) Train Loss: 0.1453, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 19:33:31] (step=0898200) Train Loss: 0.1579, Train Steps/Sec: 12.52, Grad Norm: 0.2331 +[2025-02-21 19:33:40] (step=0898300) Train Loss: 0.1464, Train Steps/Sec: 10.55, Grad Norm: 0.0064 +[2025-02-21 19:33:50] (step=0898400) Train Loss: 0.1469, Train Steps/Sec: 10.64, Grad Norm: 0.0060 +[2025-02-21 19:33:59] (step=0898500) Train Loss: 0.1453, Train Steps/Sec: 10.61, Grad Norm: 0.0059 +[2025-02-21 19:34:08] (step=0898600) Train Loss: 0.1454, Train Steps/Sec: 11.39, Grad Norm: 0.0059 +[2025-02-21 19:34:16] (step=0898700) Train Loss: 0.1469, Train Steps/Sec: 11.69, Grad Norm: 0.0058 +[2025-02-21 19:34:25] (step=0898800) Train Loss: 0.1467, Train Steps/Sec: 11.78, Grad Norm: 0.0059 +[2025-02-21 19:34:33] (step=0898900) Train Loss: 0.1459, Train Steps/Sec: 11.75, Grad Norm: 0.0057 +[2025-02-21 19:34:42] (step=0899000) Train Loss: 0.1452, Train Steps/Sec: 11.76, Grad Norm: 0.0058 +[2025-02-21 19:34:50] (step=0899100) Train Loss: 0.1445, Train Steps/Sec: 12.64, Grad Norm: 0.0058 +[2025-02-21 19:34:58] (step=0899200) Train Loss: 0.1448, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 19:35:06] (step=0899300) Train Loss: 0.1468, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 19:35:14] (step=0899400) Train Loss: 0.1445, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 19:35:23] (step=0899500) Train Loss: 0.1448, Train Steps/Sec: 11.21, Grad Norm: 0.0062 +[2025-02-21 19:35:31] (step=0899600) Train Loss: 0.1450, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 19:35:39] (step=0899700) Train Loss: 0.1451, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 19:35:47] (step=0899800) Train Loss: 0.1452, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 19:35:55] (step=0899900) Train Loss: 0.1452, Train Steps/Sec: 11.73, Grad Norm: 0.0058 +[2025-02-21 19:36:04] (step=0900000) Train Loss: 0.1465, Train Steps/Sec: 11.43, Grad Norm: 0.0061 +[2025-02-21 19:36:07] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0900000.pt +[2025-02-21 19:36:15] (step=0900100) Train Loss: 0.1455, Train Steps/Sec: 8.93, Grad Norm: 0.0061 +[2025-02-21 19:36:24] (step=0900200) Train Loss: 0.1454, Train Steps/Sec: 12.20, Grad Norm: 0.0063 +[2025-02-21 19:36:31] (step=0900300) Train Loss: 0.1450, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 19:36:39] (step=0900400) Train Loss: 0.1452, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 19:36:48] (step=0900500) Train Loss: 0.1462, Train Steps/Sec: 11.37, Grad Norm: 0.0058 +[2025-02-21 19:36:58] (step=0900600) Train Loss: 0.1463, Train Steps/Sec: 10.63, Grad Norm: 0.0064 +[2025-02-21 19:37:07] (step=0900700) Train Loss: 0.1458, Train Steps/Sec: 10.61, Grad Norm: 0.0067 +[2025-02-21 19:37:17] (step=0900800) Train Loss: 0.1455, Train Steps/Sec: 10.56, Grad Norm: 0.0059 +[2025-02-21 19:37:25] (step=0900900) Train Loss: 0.1468, Train Steps/Sec: 11.29, Grad Norm: 0.0063 +[2025-02-21 19:37:34] (step=0901000) Train Loss: 0.1447, Train Steps/Sec: 11.75, Grad Norm: 0.0062 +[2025-02-21 19:37:42] (step=0901100) Train Loss: 0.1462, Train Steps/Sec: 12.18, Grad Norm: 0.0069 +[2025-02-21 19:37:51] (step=0901200) Train Loss: 0.1454, Train Steps/Sec: 11.74, Grad Norm: 0.0062 +[2025-02-21 19:37:59] (step=0901300) Train Loss: 0.1451, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 19:38:07] (step=0901400) Train Loss: 0.1461, Train Steps/Sec: 12.18, Grad Norm: 0.0063 +[2025-02-21 19:38:15] (step=0901500) Train Loss: 0.1461, Train Steps/Sec: 12.65, Grad Norm: 0.0069 +[2025-02-21 19:38:23] (step=0901600) Train Loss: 0.1452, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 19:38:30] (step=0901700) Train Loss: 0.1440, Train Steps/Sec: 12.66, Grad Norm: 0.0059 +[2025-02-21 19:38:38] (step=0901800) Train Loss: 0.1463, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 19:38:47] (step=0901900) Train Loss: 0.1455, Train Steps/Sec: 12.21, Grad Norm: 0.0059 +[2025-02-21 19:38:55] (step=0902000) Train Loss: 0.1455, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-21 19:39:05] (step=0902100) Train Loss: 0.1458, Train Steps/Sec: 11.02, Grad Norm: 0.0062 +[2025-02-21 19:39:13] (step=0902200) Train Loss: 0.1448, Train Steps/Sec: 12.18, Grad Norm: 0.0065 +[2025-02-21 19:39:21] (step=0902300) Train Loss: 0.1451, Train Steps/Sec: 11.79, Grad Norm: 0.0064 +[2025-02-21 19:39:29] (step=0902400) Train Loss: 0.1442, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 19:39:37] (step=0902500) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 19:39:45] (step=0902600) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 19:39:54] (step=0902700) Train Loss: 0.1459, Train Steps/Sec: 11.02, Grad Norm: 0.0061 +[2025-02-21 19:40:03] (step=0902800) Train Loss: 0.1471, Train Steps/Sec: 11.00, Grad Norm: 0.0061 +[2025-02-21 19:40:13] (step=0902900) Train Loss: 0.1462, Train Steps/Sec: 10.33, Grad Norm: 0.0061 +[2025-02-21 19:40:21] (step=0903000) Train Loss: 0.1457, Train Steps/Sec: 12.19, Grad Norm: 0.0060 +[2025-02-21 19:40:30] (step=0903100) Train Loss: 0.1451, Train Steps/Sec: 11.30, Grad Norm: 0.0063 +[2025-02-21 19:40:38] (step=0903200) Train Loss: 0.1464, Train Steps/Sec: 11.68, Grad Norm: 0.0060 +[2025-02-21 19:40:48] (step=0903300) Train Loss: 0.1456, Train Steps/Sec: 10.92, Grad Norm: 0.0061 +[2025-02-21 19:40:56] (step=0903400) Train Loss: 0.1459, Train Steps/Sec: 11.72, Grad Norm: 0.0062 +[2025-02-21 19:41:04] (step=0903500) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 19:41:12] (step=0903600) Train Loss: 0.1457, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 19:41:20] (step=0903700) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 19:41:28] (step=0903800) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 19:41:36] (step=0903900) Train Loss: 0.1455, Train Steps/Sec: 12.64, Grad Norm: 0.0064 +[2025-02-21 19:41:44] (step=0904000) Train Loss: 0.1471, Train Steps/Sec: 12.21, Grad Norm: 0.0063 +[2025-02-21 19:41:52] (step=0904100) Train Loss: 0.1456, Train Steps/Sec: 12.64, Grad Norm: 0.0065 +[2025-02-21 19:42:01] (step=0904200) Train Loss: 0.1479, Train Steps/Sec: 11.78, Grad Norm: 0.0063 +[2025-02-21 19:42:09] (step=0904300) Train Loss: 0.1464, Train Steps/Sec: 11.41, Grad Norm: 0.0059 +[2025-02-21 19:42:18] (step=0904400) Train Loss: 0.1459, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 19:42:27] (step=0904500) Train Loss: 0.1450, Train Steps/Sec: 10.95, Grad Norm: 0.0063 +[2025-02-21 19:42:35] (step=0904600) Train Loss: 0.1449, Train Steps/Sec: 12.53, Grad Norm: 0.0063 +[2025-02-21 19:42:43] (step=0904700) Train Loss: 0.1462, Train Steps/Sec: 12.55, Grad Norm: 0.0062 +[2025-02-21 19:42:51] (step=0904800) Train Loss: 0.1452, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 19:43:00] (step=0904900) Train Loss: 0.1451, Train Steps/Sec: 10.96, Grad Norm: 0.0062 +[2025-02-21 19:43:09] (step=0905000) Train Loss: 0.1446, Train Steps/Sec: 11.31, Grad Norm: 0.0062 +[2025-02-21 19:43:19] (step=0905100) Train Loss: 0.1450, Train Steps/Sec: 10.31, Grad Norm: 0.0062 +[2025-02-21 19:43:27] (step=0905200) Train Loss: 0.1460, Train Steps/Sec: 12.20, Grad Norm: 0.0065 +[2025-02-21 19:43:36] (step=0905300) Train Loss: 0.1455, Train Steps/Sec: 11.00, Grad Norm: 0.0060 +[2025-02-21 19:43:44] (step=0905400) Train Loss: 0.1456, Train Steps/Sec: 12.11, Grad Norm: 0.0060 +[2025-02-21 19:43:53] (step=0905500) Train Loss: 0.1457, Train Steps/Sec: 11.72, Grad Norm: 0.0062 +[2025-02-21 19:44:01] (step=0905600) Train Loss: 0.1447, Train Steps/Sec: 12.08, Grad Norm: 0.0064 +[2025-02-21 19:44:09] (step=0905700) Train Loss: 0.1460, Train Steps/Sec: 12.50, Grad Norm: 0.0066 +[2025-02-21 19:44:18] (step=0905800) Train Loss: 0.1456, Train Steps/Sec: 10.94, Grad Norm: 0.0062 +[2025-02-21 19:44:26] (step=0905900) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 19:44:34] (step=0906000) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 19:44:42] (step=0906100) Train Loss: 0.1458, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 19:44:50] (step=0906200) Train Loss: 0.1456, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 19:44:58] (step=0906300) Train Loss: 0.1451, Train Steps/Sec: 12.16, Grad Norm: 0.0063 +[2025-02-21 19:45:07] (step=0906400) Train Loss: 0.1458, Train Steps/Sec: 11.46, Grad Norm: 0.0059 +[2025-02-21 19:45:16] (step=0906500) Train Loss: 0.1465, Train Steps/Sec: 11.72, Grad Norm: 0.0061 +[2025-02-21 19:45:24] (step=0906600) Train Loss: 0.1455, Train Steps/Sec: 12.20, Grad Norm: 0.0060 +[2025-02-21 19:45:32] (step=0906700) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 19:45:40] (step=0906800) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 19:45:48] (step=0906900) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 19:45:57] (step=0907000) Train Loss: 0.1457, Train Steps/Sec: 10.91, Grad Norm: 0.0060 +[2025-02-21 19:46:06] (step=0907100) Train Loss: 0.1452, Train Steps/Sec: 10.57, Grad Norm: 0.0064 +[2025-02-21 19:46:16] (step=0907200) Train Loss: 0.1465, Train Steps/Sec: 10.55, Grad Norm: 0.0061 +[2025-02-21 19:46:25] (step=0907300) Train Loss: 0.1441, Train Steps/Sec: 10.96, Grad Norm: 0.0062 +[2025-02-21 19:46:33] (step=0907400) Train Loss: 0.1470, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 19:46:42] (step=0907500) Train Loss: 0.1453, Train Steps/Sec: 10.95, Grad Norm: 0.0057 +[2025-02-21 19:46:50] (step=0907600) Train Loss: 0.1458, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 19:46:59] (step=0907700) Train Loss: 0.1463, Train Steps/Sec: 11.77, Grad Norm: 0.0058 +[2025-02-21 19:47:07] (step=0907800) Train Loss: 0.1470, Train Steps/Sec: 12.16, Grad Norm: 0.0063 +[2025-02-21 19:47:15] (step=0907900) Train Loss: 0.1450, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 19:47:23] (step=0908000) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 19:47:31] (step=0908100) Train Loss: 0.1449, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 19:47:39] (step=0908200) Train Loss: 0.1461, Train Steps/Sec: 12.55, Grad Norm: 0.0058 +[2025-02-21 19:47:48] (step=0908300) Train Loss: 0.1463, Train Steps/Sec: 10.94, Grad Norm: 0.0059 +[2025-02-21 19:47:56] (step=0908400) Train Loss: 0.1452, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 19:48:05] (step=0908500) Train Loss: 0.1462, Train Steps/Sec: 11.39, Grad Norm: 0.0059 +[2025-02-21 19:48:13] (step=0908600) Train Loss: 0.1462, Train Steps/Sec: 11.76, Grad Norm: 0.0058 +[2025-02-21 19:48:22] (step=0908700) Train Loss: 0.1454, Train Steps/Sec: 11.79, Grad Norm: 0.0058 +[2025-02-21 19:48:30] (step=0908800) Train Loss: 0.1455, Train Steps/Sec: 12.65, Grad Norm: 0.0060 +[2025-02-21 19:48:38] (step=0908900) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 19:48:46] (step=0909000) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 19:48:54] (step=0909100) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 19:49:02] (step=0909200) Train Loss: 0.1458, Train Steps/Sec: 12.19, Grad Norm: 0.0059 +[2025-02-21 19:49:11] (step=0909300) Train Loss: 0.1451, Train Steps/Sec: 10.33, Grad Norm: 0.0059 +[2025-02-21 19:49:21] (step=0909400) Train Loss: 0.1457, Train Steps/Sec: 10.94, Grad Norm: 0.0059 +[2025-02-21 19:49:31] (step=0909500) Train Loss: 0.1459, Train Steps/Sec: 9.93, Grad Norm: 0.0063 +[2025-02-21 19:49:39] (step=0909600) Train Loss: 0.1463, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 19:49:48] (step=0909700) Train Loss: 0.1457, Train Steps/Sec: 11.35, Grad Norm: 0.0058 +[2025-02-21 19:49:56] (step=0909800) Train Loss: 0.1459, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 19:50:04] (step=0909900) Train Loss: 0.1451, Train Steps/Sec: 11.76, Grad Norm: 0.0058 +[2025-02-21 19:50:13] (step=0910000) Train Loss: 0.1463, Train Steps/Sec: 12.17, Grad Norm: 0.0057 +[2025-02-21 19:50:21] (step=0910100) Train Loss: 0.1460, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 19:50:29] (step=0910200) Train Loss: 0.1463, Train Steps/Sec: 12.64, Grad Norm: 0.0061 +[2025-02-21 19:50:37] (step=0910300) Train Loss: 0.1447, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 19:50:45] (step=0910400) Train Loss: 0.1449, Train Steps/Sec: 12.22, Grad Norm: 0.0057 +[2025-02-21 19:50:53] (step=0910500) Train Loss: 0.1454, Train Steps/Sec: 12.64, Grad Norm: 0.0064 +[2025-02-21 19:51:01] (step=0910600) Train Loss: 0.1460, Train Steps/Sec: 11.78, Grad Norm: 0.0061 +[2025-02-21 19:51:10] (step=0910700) Train Loss: 0.1467, Train Steps/Sec: 11.36, Grad Norm: 0.0061 +[2025-02-21 19:51:20] (step=0910800) Train Loss: 0.1472, Train Steps/Sec: 10.61, Grad Norm: 0.0060 +[2025-02-21 19:51:27] (step=0910900) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 19:51:35] (step=0911000) Train Loss: 0.1461, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 19:51:43] (step=0911100) Train Loss: 0.1451, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 19:51:51] (step=0911200) Train Loss: 0.1455, Train Steps/Sec: 12.53, Grad Norm: 0.0059 +[2025-02-21 19:51:59] (step=0911300) Train Loss: 0.1452, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 19:52:08] (step=0911400) Train Loss: 0.1462, Train Steps/Sec: 11.36, Grad Norm: 0.0062 +[2025-02-21 19:52:17] (step=0911500) Train Loss: 0.1445, Train Steps/Sec: 11.01, Grad Norm: 0.0063 +[2025-02-21 19:52:27] (step=0911600) Train Loss: 0.1464, Train Steps/Sec: 10.60, Grad Norm: 0.0064 +[2025-02-21 19:52:35] (step=0911700) Train Loss: 0.1455, Train Steps/Sec: 11.38, Grad Norm: 0.0060 +[2025-02-21 19:52:44] (step=0911800) Train Loss: 0.1468, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 19:52:53] (step=0911900) Train Loss: 0.1450, Train Steps/Sec: 11.34, Grad Norm: 0.0062 +[2025-02-21 19:53:02] (step=0912000) Train Loss: 0.1457, Train Steps/Sec: 10.90, Grad Norm: 0.0061 +[2025-02-21 19:53:10] (step=0912100) Train Loss: 0.1472, Train Steps/Sec: 11.73, Grad Norm: 0.0058 +[2025-02-21 19:53:18] (step=0912200) Train Loss: 0.1456, Train Steps/Sec: 12.12, Grad Norm: 0.0063 +[2025-02-21 19:53:27] (step=0912300) Train Loss: 0.1456, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 19:53:35] (step=0912400) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 19:53:43] (step=0912500) Train Loss: 0.1445, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 19:53:51] (step=0912600) Train Loss: 0.1468, Train Steps/Sec: 12.21, Grad Norm: 0.0060 +[2025-02-21 19:53:59] (step=0912700) Train Loss: 0.1460, Train Steps/Sec: 11.77, Grad Norm: 0.0062 +[2025-02-21 19:54:08] (step=0912800) Train Loss: 0.1454, Train Steps/Sec: 11.84, Grad Norm: 0.0064 +[2025-02-21 19:54:16] (step=0912900) Train Loss: 0.1443, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 19:54:24] (step=0913000) Train Loss: 0.1456, Train Steps/Sec: 11.78, Grad Norm: 0.0060 +[2025-02-21 19:54:32] (step=0913100) Train Loss: 0.1449, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 19:54:40] (step=0913200) Train Loss: 0.1441, Train Steps/Sec: 12.54, Grad Norm: 0.0062 +[2025-02-21 19:54:49] (step=0913300) Train Loss: 0.1459, Train Steps/Sec: 11.36, Grad Norm: 0.0063 +[2025-02-21 19:54:57] (step=0913400) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 19:55:05] (step=0913500) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 19:55:14] (step=0913600) Train Loss: 0.1453, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 19:55:23] (step=0913700) Train Loss: 0.1450, Train Steps/Sec: 10.99, Grad Norm: 0.0061 +[2025-02-21 19:55:33] (step=0913800) Train Loss: 0.1461, Train Steps/Sec: 10.26, Grad Norm: 0.0061 +[2025-02-21 19:55:41] (step=0913900) Train Loss: 0.1459, Train Steps/Sec: 11.79, Grad Norm: 0.0062 +[2025-02-21 19:55:49] (step=0914000) Train Loss: 0.1455, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 19:55:58] (step=0914100) Train Loss: 0.1450, Train Steps/Sec: 11.38, Grad Norm: 0.0059 +[2025-02-21 19:56:06] (step=0914200) Train Loss: 0.1447, Train Steps/Sec: 12.13, Grad Norm: 0.0061 +[2025-02-21 19:56:15] (step=0914300) Train Loss: 0.1459, Train Steps/Sec: 11.74, Grad Norm: 0.0061 +[2025-02-21 19:56:23] (step=0914400) Train Loss: 0.1452, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 19:56:32] (step=0914500) Train Loss: 0.1452, Train Steps/Sec: 10.86, Grad Norm: 0.0061 +[2025-02-21 19:56:40] (step=0914600) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0067 +[2025-02-21 19:56:49] (step=0914700) Train Loss: 0.1453, Train Steps/Sec: 12.18, Grad Norm: 0.0062 +[2025-02-21 19:56:56] (step=0914800) Train Loss: 0.1479, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 19:57:05] (step=0914900) Train Loss: 0.1454, Train Steps/Sec: 11.40, Grad Norm: 0.0060 +[2025-02-21 19:57:14] (step=0915000) Train Loss: 0.1469, Train Steps/Sec: 11.76, Grad Norm: 0.0058 +[2025-02-21 19:57:22] (step=0915100) Train Loss: 0.1455, Train Steps/Sec: 11.77, Grad Norm: 0.0059 +[2025-02-21 19:57:30] (step=0915200) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 19:57:38] (step=0915300) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 19:57:46] (step=0915400) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 19:57:54] (step=0915500) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 19:58:02] (step=0915600) Train Loss: 0.1454, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 19:58:10] (step=0915700) Train Loss: 0.1469, Train Steps/Sec: 12.55, Grad Norm: 0.0061 +[2025-02-21 19:58:20] (step=0915800) Train Loss: 0.1466, Train Steps/Sec: 9.67, Grad Norm: 0.0060 +[2025-02-21 19:58:29] (step=0915900) Train Loss: 0.1463, Train Steps/Sec: 11.38, Grad Norm: 0.0060 +[2025-02-21 19:58:38] (step=0916000) Train Loss: 0.1449, Train Steps/Sec: 10.56, Grad Norm: 0.0060 +[2025-02-21 19:58:47] (step=0916100) Train Loss: 0.1457, Train Steps/Sec: 11.80, Grad Norm: 0.0058 +[2025-02-21 19:58:55] (step=0916200) Train Loss: 0.1460, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 19:59:04] (step=0916300) Train Loss: 0.1452, Train Steps/Sec: 11.33, Grad Norm: 0.0058 +[2025-02-21 19:59:12] (step=0916400) Train Loss: 0.1455, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 19:59:21] (step=0916500) Train Loss: 0.1448, Train Steps/Sec: 11.77, Grad Norm: 0.0059 +[2025-02-21 19:59:29] (step=0916600) Train Loss: 0.1459, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 19:59:37] (step=0916700) Train Loss: 0.1458, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 19:59:45] (step=0916800) Train Loss: 0.1454, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 19:59:53] (step=0916900) Train Loss: 0.1466, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 20:00:03] (step=0917000) Train Loss: 0.1458, Train Steps/Sec: 10.54, Grad Norm: 0.0064 +[2025-02-21 20:00:12] (step=0917100) Train Loss: 0.1458, Train Steps/Sec: 11.37, Grad Norm: 0.0059 +[2025-02-21 20:00:20] (step=0917200) Train Loss: 0.1463, Train Steps/Sec: 11.75, Grad Norm: 0.0060 +[2025-02-21 20:00:28] (step=0917300) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0067 +[2025-02-21 20:00:36] (step=0917400) Train Loss: 0.1450, Train Steps/Sec: 12.61, Grad Norm: 0.0065 +[2025-02-21 20:00:44] (step=0917500) Train Loss: 0.1461, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 20:00:52] (step=0917600) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 20:01:00] (step=0917700) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 20:01:08] (step=0917800) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 20:01:16] (step=0917900) Train Loss: 0.1453, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 20:01:25] (step=0918000) Train Loss: 0.1460, Train Steps/Sec: 10.58, Grad Norm: 0.0061 +[2025-02-21 20:01:34] (step=0918100) Train Loss: 0.1466, Train Steps/Sec: 11.36, Grad Norm: 0.0062 +[2025-02-21 20:01:44] (step=0918200) Train Loss: 0.1466, Train Steps/Sec: 10.55, Grad Norm: 0.0060 +[2025-02-21 20:01:53] (step=0918300) Train Loss: 0.1470, Train Steps/Sec: 10.45, Grad Norm: 0.0063 +[2025-02-21 20:02:02] (step=0918400) Train Loss: 0.1461, Train Steps/Sec: 11.66, Grad Norm: 0.0062 +[2025-02-21 20:02:10] (step=0918500) Train Loss: 0.1461, Train Steps/Sec: 11.74, Grad Norm: 0.0060 +[2025-02-21 20:02:18] (step=0918600) Train Loss: 0.1461, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 20:02:27] (step=0918700) Train Loss: 0.1449, Train Steps/Sec: 11.34, Grad Norm: 0.0059 +[2025-02-21 20:02:35] (step=0918800) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 20:02:43] (step=0918900) Train Loss: 0.1476, Train Steps/Sec: 12.15, Grad Norm: 0.0065 +[2025-02-21 20:02:52] (step=0919000) Train Loss: 0.1454, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 20:03:00] (step=0919100) Train Loss: 0.1447, Train Steps/Sec: 11.77, Grad Norm: 0.0069 +[2025-02-21 20:03:08] (step=0919200) Train Loss: 0.1454, Train Steps/Sec: 12.11, Grad Norm: 0.0061 +[2025-02-21 20:03:17] (step=0919300) Train Loss: 0.1454, Train Steps/Sec: 11.74, Grad Norm: 0.0058 +[2025-02-21 20:03:25] (step=0919400) Train Loss: 0.1455, Train Steps/Sec: 11.75, Grad Norm: 0.0062 +[2025-02-21 20:03:34] (step=0919500) Train Loss: 0.1449, Train Steps/Sec: 11.21, Grad Norm: 0.0061 +[2025-02-21 20:03:42] (step=0919600) Train Loss: 0.1468, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 20:03:50] (step=0919700) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 20:03:58] (step=0919800) Train Loss: 0.1456, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 20:04:06] (step=0919900) Train Loss: 0.1450, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 20:04:14] (step=0920000) Train Loss: 0.1460, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 20:04:17] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0920000.pt +[2025-02-21 20:04:25] (step=0920100) Train Loss: 0.1451, Train Steps/Sec: 8.95, Grad Norm: 0.0059 +[2025-02-21 20:04:35] (step=0920200) Train Loss: 0.1453, Train Steps/Sec: 10.34, Grad Norm: 0.0060 +[2025-02-21 20:04:44] (step=0920300) Train Loss: 0.1458, Train Steps/Sec: 10.97, Grad Norm: 0.0060 +[2025-02-21 20:04:53] (step=0920400) Train Loss: 0.1461, Train Steps/Sec: 10.61, Grad Norm: 0.0061 +[2025-02-21 20:05:01] (step=0920500) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 20:05:10] (step=0920600) Train Loss: 0.1454, Train Steps/Sec: 12.11, Grad Norm: 0.0062 +[2025-02-21 20:05:18] (step=0920700) Train Loss: 0.1459, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 20:05:28] (step=0920800) Train Loss: 0.1469, Train Steps/Sec: 10.82, Grad Norm: 0.0063 +[2025-02-21 20:05:36] (step=0920900) Train Loss: 0.1471, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 20:05:44] (step=0921000) Train Loss: 0.1453, Train Steps/Sec: 12.64, Grad Norm: 0.0059 +[2025-02-21 20:05:53] (step=0921100) Train Loss: 0.1457, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 20:06:01] (step=0921200) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 20:06:10] (step=0921300) Train Loss: 0.1459, Train Steps/Sec: 11.37, Grad Norm: 0.0062 +[2025-02-21 20:06:18] (step=0921400) Train Loss: 0.1455, Train Steps/Sec: 11.79, Grad Norm: 0.0060 +[2025-02-21 20:06:27] (step=0921500) Train Loss: 0.1466, Train Steps/Sec: 11.82, Grad Norm: 0.0066 +[2025-02-21 20:06:34] (step=0921600) Train Loss: 0.1447, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 20:06:42] (step=0921700) Train Loss: 0.1446, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 20:06:50] (step=0921800) Train Loss: 0.1467, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 20:06:58] (step=0921900) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 20:07:07] (step=0922000) Train Loss: 0.1457, Train Steps/Sec: 11.27, Grad Norm: 0.0060 +[2025-02-21 20:07:15] (step=0922100) Train Loss: 0.1472, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 20:07:23] (step=0922200) Train Loss: 0.1446, Train Steps/Sec: 12.49, Grad Norm: 0.0059 +[2025-02-21 20:07:32] (step=0922300) Train Loss: 0.1452, Train Steps/Sec: 11.72, Grad Norm: 0.0058 +[2025-02-21 20:07:41] (step=0922400) Train Loss: 0.1465, Train Steps/Sec: 10.68, Grad Norm: 0.0060 +[2025-02-21 20:07:50] (step=0922500) Train Loss: 0.1461, Train Steps/Sec: 10.93, Grad Norm: 0.0061 +[2025-02-21 20:07:59] (step=0922600) Train Loss: 0.1461, Train Steps/Sec: 10.96, Grad Norm: 0.0062 +[2025-02-21 20:08:07] (step=0922700) Train Loss: 0.1469, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 20:08:16] (step=0922800) Train Loss: 0.1447, Train Steps/Sec: 12.03, Grad Norm: 0.0061 +[2025-02-21 20:08:24] (step=0922900) Train Loss: 0.1459, Train Steps/Sec: 11.30, Grad Norm: 0.0062 +[2025-02-21 20:08:33] (step=0923000) Train Loss: 0.1461, Train Steps/Sec: 12.04, Grad Norm: 0.0061 +[2025-02-21 20:08:42] (step=0923100) Train Loss: 0.1457, Train Steps/Sec: 11.17, Grad Norm: 0.0061 +[2025-02-21 20:08:50] (step=0923200) Train Loss: 0.1457, Train Steps/Sec: 12.13, Grad Norm: 0.0065 +[2025-02-21 20:08:59] (step=0923300) Train Loss: 0.1452, Train Steps/Sec: 10.86, Grad Norm: 0.0060 +[2025-02-21 20:09:08] (step=0923400) Train Loss: 0.1448, Train Steps/Sec: 11.70, Grad Norm: 0.0061 +[2025-02-21 20:09:16] (step=0923500) Train Loss: 0.1444, Train Steps/Sec: 11.36, Grad Norm: 0.0068 +[2025-02-21 20:09:24] (step=0923600) Train Loss: 0.1454, Train Steps/Sec: 12.56, Grad Norm: 0.0057 +[2025-02-21 20:09:33] (step=0923700) Train Loss: 0.1449, Train Steps/Sec: 11.78, Grad Norm: 0.0061 +[2025-02-21 20:09:41] (step=0923800) Train Loss: 0.1457, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 20:09:49] (step=0923900) Train Loss: 0.1458, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 20:09:57] (step=0924000) Train Loss: 0.1453, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 20:10:05] (step=0924100) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 20:10:13] (step=0924200) Train Loss: 0.1454, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 20:10:21] (step=0924300) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 20:10:29] (step=0924400) Train Loss: 0.1438, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 20:10:38] (step=0924500) Train Loss: 0.1456, Train Steps/Sec: 10.88, Grad Norm: 0.0062 +[2025-02-21 20:10:47] (step=0924600) Train Loss: 0.1450, Train Steps/Sec: 10.97, Grad Norm: 0.0062 +[2025-02-21 20:10:56] (step=0924700) Train Loss: 0.1457, Train Steps/Sec: 10.91, Grad Norm: 0.0065 +[2025-02-21 20:11:05] (step=0924800) Train Loss: 0.1450, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-21 20:11:13] (step=0924900) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 20:11:22] (step=0925000) Train Loss: 0.1453, Train Steps/Sec: 11.72, Grad Norm: 0.0063 +[2025-02-21 20:11:30] (step=0925100) Train Loss: 0.1465, Train Steps/Sec: 11.76, Grad Norm: 0.0062 +[2025-02-21 20:11:39] (step=0925200) Train Loss: 0.1458, Train Steps/Sec: 11.70, Grad Norm: 0.0062 +[2025-02-21 20:11:47] (step=0925300) Train Loss: 0.1455, Train Steps/Sec: 11.68, Grad Norm: 0.0060 +[2025-02-21 20:11:56] (step=0925400) Train Loss: 0.1453, Train Steps/Sec: 11.74, Grad Norm: 0.0061 +[2025-02-21 20:12:04] (step=0925500) Train Loss: 0.1451, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 20:12:13] (step=0925600) Train Loss: 0.1457, Train Steps/Sec: 11.41, Grad Norm: 0.0063 +[2025-02-21 20:12:21] (step=0925700) Train Loss: 0.1460, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 20:12:31] (step=0925800) Train Loss: 0.1453, Train Steps/Sec: 10.51, Grad Norm: 0.0061 +[2025-02-21 20:12:38] (step=0925900) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 20:12:46] (step=0926000) Train Loss: 0.1446, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 20:12:54] (step=0926100) Train Loss: 0.1456, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 20:13:02] (step=0926200) Train Loss: 0.1453, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 20:13:10] (step=0926300) Train Loss: 0.1453, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 20:13:18] (step=0926400) Train Loss: 0.1462, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 20:13:26] (step=0926500) Train Loss: 0.1463, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 20:13:34] (step=0926600) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 20:13:42] (step=0926700) Train Loss: 0.1449, Train Steps/Sec: 12.09, Grad Norm: 0.0058 +[2025-02-21 20:13:51] (step=0926800) Train Loss: 0.1454, Train Steps/Sec: 10.98, Grad Norm: 0.0060 +[2025-02-21 20:14:01] (step=0926900) Train Loss: 0.1458, Train Steps/Sec: 10.57, Grad Norm: 0.0064 +[2025-02-21 20:14:11] (step=0927000) Train Loss: 0.1452, Train Steps/Sec: 10.26, Grad Norm: 0.0061 +[2025-02-21 20:14:19] (step=0927100) Train Loss: 0.1445, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 20:14:27] (step=0927200) Train Loss: 0.1467, Train Steps/Sec: 12.14, Grad Norm: 0.0059 +[2025-02-21 20:14:36] (step=0927300) Train Loss: 0.1452, Train Steps/Sec: 11.74, Grad Norm: 0.0060 +[2025-02-21 20:14:44] (step=0927400) Train Loss: 0.1466, Train Steps/Sec: 11.72, Grad Norm: 0.0062 +[2025-02-21 20:14:53] (step=0927500) Train Loss: 0.1459, Train Steps/Sec: 11.35, Grad Norm: 0.0059 +[2025-02-21 20:15:01] (step=0927600) Train Loss: 0.1452, Train Steps/Sec: 12.16, Grad Norm: 0.0059 +[2025-02-21 20:15:10] (step=0927700) Train Loss: 0.1459, Train Steps/Sec: 11.37, Grad Norm: 0.0061 +[2025-02-21 20:15:18] (step=0927800) Train Loss: 0.1454, Train Steps/Sec: 11.79, Grad Norm: 0.0062 +[2025-02-21 20:15:27] (step=0927900) Train Loss: 0.1460, Train Steps/Sec: 11.80, Grad Norm: 0.0060 +[2025-02-21 20:15:35] (step=0928000) Train Loss: 0.1453, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 20:15:43] (step=0928100) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 20:15:51] (step=0928200) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 20:16:00] (step=0928300) Train Loss: 0.1463, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-21 20:16:08] (step=0928400) Train Loss: 0.1455, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 20:16:15] (step=0928500) Train Loss: 0.1461, Train Steps/Sec: 12.55, Grad Norm: 0.0060 +[2025-02-21 20:16:23] (step=0928600) Train Loss: 0.1451, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 20:16:31] (step=0928700) Train Loss: 0.1450, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 20:16:39] (step=0928800) Train Loss: 0.1462, Train Steps/Sec: 12.54, Grad Norm: 0.0061 +[2025-02-21 20:16:48] (step=0928900) Train Loss: 0.1462, Train Steps/Sec: 12.01, Grad Norm: 0.0064 +[2025-02-21 20:16:57] (step=0929000) Train Loss: 0.1447, Train Steps/Sec: 10.63, Grad Norm: 0.0058 +[2025-02-21 20:17:06] (step=0929100) Train Loss: 0.1462, Train Steps/Sec: 10.86, Grad Norm: 0.0058 +[2025-02-21 20:17:15] (step=0929200) Train Loss: 0.1442, Train Steps/Sec: 11.30, Grad Norm: 0.0061 +[2025-02-21 20:17:23] (step=0929300) Train Loss: 0.1462, Train Steps/Sec: 12.10, Grad Norm: 0.0062 +[2025-02-21 20:17:32] (step=0929400) Train Loss: 0.1455, Train Steps/Sec: 11.66, Grad Norm: 0.0059 +[2025-02-21 20:17:41] (step=0929500) Train Loss: 0.1454, Train Steps/Sec: 10.83, Grad Norm: 0.0062 +[2025-02-21 20:17:50] (step=0929600) Train Loss: 0.1454, Train Steps/Sec: 10.92, Grad Norm: 0.0059 +[2025-02-21 20:17:59] (step=0929700) Train Loss: 0.1456, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 20:18:07] (step=0929800) Train Loss: 0.1457, Train Steps/Sec: 11.34, Grad Norm: 0.0063 +[2025-02-21 20:18:16] (step=0929900) Train Loss: 0.1459, Train Steps/Sec: 11.36, Grad Norm: 0.0058 +[2025-02-21 20:18:24] (step=0930000) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 20:18:33] (step=0930100) Train Loss: 0.1451, Train Steps/Sec: 11.79, Grad Norm: 0.0064 +[2025-02-21 20:18:41] (step=0930200) Train Loss: 0.1468, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 20:18:49] (step=0930300) Train Loss: 0.1459, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 20:18:57] (step=0930400) Train Loss: 0.1452, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 20:19:04] (step=0930500) Train Loss: 0.1465, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 20:19:12] (step=0930600) Train Loss: 0.1436, Train Steps/Sec: 12.57, Grad Norm: 0.0064 +[2025-02-21 20:19:20] (step=0930700) Train Loss: 0.1450, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 20:19:29] (step=0930800) Train Loss: 0.1455, Train Steps/Sec: 11.24, Grad Norm: 0.0062 +[2025-02-21 20:19:37] (step=0930900) Train Loss: 0.1454, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 20:19:45] (step=0931000) Train Loss: 0.1455, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 20:19:54] (step=0931100) Train Loss: 0.1462, Train Steps/Sec: 10.92, Grad Norm: 0.0059 +[2025-02-21 20:20:04] (step=0931200) Train Loss: 0.1458, Train Steps/Sec: 10.64, Grad Norm: 0.0062 +[2025-02-21 20:20:13] (step=0931300) Train Loss: 0.1459, Train Steps/Sec: 11.28, Grad Norm: 0.0060 +[2025-02-21 20:20:21] (step=0931400) Train Loss: 0.1456, Train Steps/Sec: 11.33, Grad Norm: 0.0059 +[2025-02-21 20:20:30] (step=0931500) Train Loss: 0.1455, Train Steps/Sec: 12.13, Grad Norm: 0.0063 +[2025-02-21 20:20:38] (step=0931600) Train Loss: 0.1468, Train Steps/Sec: 11.37, Grad Norm: 0.0060 +[2025-02-21 20:20:46] (step=0931700) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 20:20:56] (step=0931800) Train Loss: 0.1460, Train Steps/Sec: 10.93, Grad Norm: 0.0061 +[2025-02-21 20:21:04] (step=0931900) Train Loss: 0.1454, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 20:21:14] (step=0932000) Train Loss: 0.1463, Train Steps/Sec: 9.95, Grad Norm: 0.0063 +[2025-02-21 20:21:22] (step=0932100) Train Loss: 0.1453, Train Steps/Sec: 11.73, Grad Norm: 0.0061 +[2025-02-21 20:21:31] (step=0932200) Train Loss: 0.1453, Train Steps/Sec: 11.73, Grad Norm: 0.0063 +[2025-02-21 20:21:39] (step=0932300) Train Loss: 0.1456, Train Steps/Sec: 12.57, Grad Norm: 0.0062 +[2025-02-21 20:21:47] (step=0932400) Train Loss: 0.1448, Train Steps/Sec: 12.57, Grad Norm: 0.0063 +[2025-02-21 20:21:55] (step=0932500) Train Loss: 0.1461, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 20:22:03] (step=0932600) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 20:22:11] (step=0932700) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 20:22:19] (step=0932800) Train Loss: 0.1448, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 20:22:27] (step=0932900) Train Loss: 0.1464, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 20:22:35] (step=0933000) Train Loss: 0.1444, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 20:22:42] (step=0933100) Train Loss: 0.1457, Train Steps/Sec: 12.57, Grad Norm: 0.0067 +[2025-02-21 20:22:50] (step=0933200) Train Loss: 0.1465, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 20:23:01] (step=0933300) Train Loss: 0.1476, Train Steps/Sec: 9.39, Grad Norm: 0.0058 +[2025-02-21 20:23:10] (step=0933400) Train Loss: 0.1453, Train Steps/Sec: 11.33, Grad Norm: 0.0060 +[2025-02-21 20:23:19] (step=0933500) Train Loss: 0.1449, Train Steps/Sec: 10.55, Grad Norm: 0.0059 +[2025-02-21 20:23:28] (step=0933600) Train Loss: 0.1461, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 20:23:36] (step=0933700) Train Loss: 0.1450, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 20:23:45] (step=0933800) Train Loss: 0.1475, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 20:23:53] (step=0933900) Train Loss: 0.1467, Train Steps/Sec: 11.73, Grad Norm: 0.0062 +[2025-02-21 20:24:02] (step=0934000) Train Loss: 0.1449, Train Steps/Sec: 11.68, Grad Norm: 0.0059 +[2025-02-21 20:24:11] (step=0934100) Train Loss: 0.1451, Train Steps/Sec: 11.36, Grad Norm: 0.0062 +[2025-02-21 20:24:20] (step=0934200) Train Loss: 0.1459, Train Steps/Sec: 10.95, Grad Norm: 0.0062 +[2025-02-21 20:24:28] (step=0934300) Train Loss: 0.1448, Train Steps/Sec: 11.80, Grad Norm: 0.0060 +[2025-02-21 20:24:36] (step=0934400) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 20:24:45] (step=0934500) Train Loss: 0.1463, Train Steps/Sec: 11.26, Grad Norm: 0.0062 +[2025-02-21 20:24:53] (step=0934600) Train Loss: 0.1457, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 20:25:01] (step=0934700) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 20:25:09] (step=0934800) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 20:25:17] (step=0934900) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 20:25:25] (step=0935000) Train Loss: 0.1445, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 20:25:33] (step=0935100) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 20:25:41] (step=0935200) Train Loss: 0.1435, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 20:25:48] (step=0935300) Train Loss: 0.1448, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 20:25:56] (step=0935400) Train Loss: 0.1477, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 20:26:06] (step=0935500) Train Loss: 0.1459, Train Steps/Sec: 10.32, Grad Norm: 0.0061 +[2025-02-21 20:26:15] (step=0935600) Train Loss: 0.1445, Train Steps/Sec: 11.74, Grad Norm: 0.0061 +[2025-02-21 20:26:24] (step=0935700) Train Loss: 0.1448, Train Steps/Sec: 10.55, Grad Norm: 0.0061 +[2025-02-21 20:26:33] (step=0935800) Train Loss: 0.1441, Train Steps/Sec: 10.94, Grad Norm: 0.0061 +[2025-02-21 20:26:41] (step=0935900) Train Loss: 0.1450, Train Steps/Sec: 12.14, Grad Norm: 0.0063 +[2025-02-21 20:26:50] (step=0936000) Train Loss: 0.1449, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 20:26:59] (step=0936100) Train Loss: 0.1442, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 20:27:08] (step=0936200) Train Loss: 0.1460, Train Steps/Sec: 11.31, Grad Norm: 0.0059 +[2025-02-21 20:27:17] (step=0936300) Train Loss: 0.1454, Train Steps/Sec: 11.01, Grad Norm: 0.0059 +[2025-02-21 20:27:25] (step=0936400) Train Loss: 0.1453, Train Steps/Sec: 11.72, Grad Norm: 0.0059 +[2025-02-21 20:27:34] (step=0936500) Train Loss: 0.1448, Train Steps/Sec: 11.81, Grad Norm: 0.0062 +[2025-02-21 20:27:42] (step=0936600) Train Loss: 0.1457, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 20:27:50] (step=0936700) Train Loss: 0.1465, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 20:27:58] (step=0936800) Train Loss: 0.1454, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 20:28:06] (step=0936900) Train Loss: 0.1451, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 20:28:15] (step=0937000) Train Loss: 0.1456, Train Steps/Sec: 11.23, Grad Norm: 0.0060 +[2025-02-21 20:28:22] (step=0937100) Train Loss: 0.1453, Train Steps/Sec: 12.54, Grad Norm: 0.0059 +[2025-02-21 20:28:30] (step=0937200) Train Loss: 0.1450, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 20:28:38] (step=0937300) Train Loss: 0.1441, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 20:28:46] (step=0937400) Train Loss: 0.1447, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 20:28:54] (step=0937500) Train Loss: 0.1445, Train Steps/Sec: 12.60, Grad Norm: 0.0063 +[2025-02-21 20:29:02] (step=0937600) Train Loss: 0.1444, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 20:29:12] (step=0937700) Train Loss: 0.1464, Train Steps/Sec: 10.33, Grad Norm: 0.0063 +[2025-02-21 20:29:20] (step=0937800) Train Loss: 0.1457, Train Steps/Sec: 11.74, Grad Norm: 0.0058 +[2025-02-21 20:29:30] (step=0937900) Train Loss: 0.1453, Train Steps/Sec: 10.25, Grad Norm: 0.0058 +[2025-02-21 20:29:38] (step=0938000) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 20:29:46] (step=0938100) Train Loss: 0.1467, Train Steps/Sec: 12.18, Grad Norm: 0.0061 +[2025-02-21 20:29:55] (step=0938200) Train Loss: 0.1450, Train Steps/Sec: 11.00, Grad Norm: 0.0060 +[2025-02-21 20:30:05] (step=0938300) Train Loss: 0.1464, Train Steps/Sec: 10.87, Grad Norm: 0.0058 +[2025-02-21 20:30:14] (step=0938400) Train Loss: 0.1456, Train Steps/Sec: 10.59, Grad Norm: 0.0057 +[2025-02-21 20:30:23] (step=0938500) Train Loss: 0.1456, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-21 20:30:32] (step=0938600) Train Loss: 0.1455, Train Steps/Sec: 11.79, Grad Norm: 0.0062 +[2025-02-21 20:30:40] (step=0938700) Train Loss: 0.1457, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 20:30:48] (step=0938800) Train Loss: 0.1445, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 20:30:55] (step=0938900) Train Loss: 0.1461, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 20:31:03] (step=0939000) Train Loss: 0.1458, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 20:31:11] (step=0939100) Train Loss: 0.1445, Train Steps/Sec: 12.56, Grad Norm: 0.0067 +[2025-02-21 20:31:19] (step=0939200) Train Loss: 0.1470, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 20:31:27] (step=0939300) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 20:31:35] (step=0939400) Train Loss: 0.1451, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 20:31:43] (step=0939500) Train Loss: 0.1450, Train Steps/Sec: 12.50, Grad Norm: 0.0061 +[2025-02-21 20:31:52] (step=0939600) Train Loss: 0.1476, Train Steps/Sec: 11.20, Grad Norm: 0.0061 +[2025-02-21 20:32:00] (step=0939700) Train Loss: 0.1447, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 20:32:08] (step=0939800) Train Loss: 0.1458, Train Steps/Sec: 12.09, Grad Norm: 0.0060 +[2025-02-21 20:32:18] (step=0939900) Train Loss: 0.1458, Train Steps/Sec: 10.65, Grad Norm: 0.0061 +[2025-02-21 20:32:27] (step=0940000) Train Loss: 0.1467, Train Steps/Sec: 10.98, Grad Norm: 0.0061 +[2025-02-21 20:32:29] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0940000.pt +[2025-02-21 20:32:39] (step=0940100) Train Loss: 0.1470, Train Steps/Sec: 8.50, Grad Norm: 0.0064 +[2025-02-21 20:32:47] (step=0940200) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 20:32:55] (step=0940300) Train Loss: 0.1470, Train Steps/Sec: 12.16, Grad Norm: 0.0060 +[2025-02-21 20:33:04] (step=0940400) Train Loss: 0.1462, Train Steps/Sec: 10.95, Grad Norm: 0.0061 +[2025-02-21 20:33:13] (step=0940500) Train Loss: 0.1454, Train Steps/Sec: 11.33, Grad Norm: 0.0058 +[2025-02-21 20:33:22] (step=0940600) Train Loss: 0.1468, Train Steps/Sec: 10.96, Grad Norm: 0.0061 +[2025-02-21 20:33:31] (step=0940700) Train Loss: 0.1469, Train Steps/Sec: 10.68, Grad Norm: 0.0060 +[2025-02-21 20:33:40] (step=0940800) Train Loss: 0.1453, Train Steps/Sec: 11.20, Grad Norm: 0.0060 +[2025-02-21 20:33:48] (step=0940900) Train Loss: 0.1466, Train Steps/Sec: 12.56, Grad Norm: 0.0059 +[2025-02-21 20:33:56] (step=0941000) Train Loss: 0.1451, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 20:34:04] (step=0941100) Train Loss: 0.1465, Train Steps/Sec: 12.55, Grad Norm: 0.0061 +[2025-02-21 20:34:12] (step=0941200) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 20:34:20] (step=0941300) Train Loss: 0.1453, Train Steps/Sec: 12.58, Grad Norm: 0.0064 +[2025-02-21 20:34:28] (step=0941400) Train Loss: 0.1454, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 20:34:36] (step=0941500) Train Loss: 0.1445, Train Steps/Sec: 12.61, Grad Norm: 0.0064 +[2025-02-21 20:34:44] (step=0941600) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 20:34:52] (step=0941700) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 20:35:00] (step=0941800) Train Loss: 0.1450, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 20:35:08] (step=0941900) Train Loss: 0.1460, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 20:35:16] (step=0942000) Train Loss: 0.1460, Train Steps/Sec: 12.10, Grad Norm: 0.0059 +[2025-02-21 20:35:26] (step=0942100) Train Loss: 0.1468, Train Steps/Sec: 9.44, Grad Norm: 0.0063 +[2025-02-21 20:35:36] (step=0942200) Train Loss: 0.1445, Train Steps/Sec: 10.90, Grad Norm: 0.0061 +[2025-02-21 20:35:45] (step=0942300) Train Loss: 0.1462, Train Steps/Sec: 10.95, Grad Norm: 0.0063 +[2025-02-21 20:35:53] (step=0942400) Train Loss: 0.1450, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 20:36:02] (step=0942500) Train Loss: 0.1460, Train Steps/Sec: 11.36, Grad Norm: 0.0059 +[2025-02-21 20:36:10] (step=0942600) Train Loss: 0.1454, Train Steps/Sec: 11.76, Grad Norm: 0.0061 +[2025-02-21 20:36:19] (step=0942700) Train Loss: 0.1462, Train Steps/Sec: 10.90, Grad Norm: 0.0062 +[2025-02-21 20:36:28] (step=0942800) Train Loss: 0.1472, Train Steps/Sec: 10.98, Grad Norm: 0.0060 +[2025-02-21 20:36:37] (step=0942900) Train Loss: 0.1457, Train Steps/Sec: 11.00, Grad Norm: 0.0060 +[2025-02-21 20:36:45] (step=0943000) Train Loss: 0.1446, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 20:36:53] (step=0943100) Train Loss: 0.1445, Train Steps/Sec: 12.57, Grad Norm: 0.0058 +[2025-02-21 20:37:01] (step=0943200) Train Loss: 0.1451, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 20:37:10] (step=0943300) Train Loss: 0.1457, Train Steps/Sec: 11.20, Grad Norm: 0.0061 +[2025-02-21 20:37:18] (step=0943400) Train Loss: 0.1458, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 20:37:26] (step=0943500) Train Loss: 0.1445, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 20:37:34] (step=0943600) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 20:37:42] (step=0943700) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 20:37:50] (step=0943800) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 20:37:58] (step=0943900) Train Loss: 0.1466, Train Steps/Sec: 12.57, Grad Norm: 0.0066 +[2025-02-21 20:38:06] (step=0944000) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 20:38:14] (step=0944100) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 20:38:22] (step=0944200) Train Loss: 0.1468, Train Steps/Sec: 11.69, Grad Norm: 0.0062 +[2025-02-21 20:38:32] (step=0944300) Train Loss: 0.1463, Train Steps/Sec: 10.33, Grad Norm: 0.0061 +[2025-02-21 20:38:41] (step=0944400) Train Loss: 0.1460, Train Steps/Sec: 10.91, Grad Norm: 0.0061 +[2025-02-21 20:38:50] (step=0944500) Train Loss: 0.1466, Train Steps/Sec: 11.29, Grad Norm: 0.0064 +[2025-02-21 20:38:59] (step=0944600) Train Loss: 0.1451, Train Steps/Sec: 10.99, Grad Norm: 0.0065 +[2025-02-21 20:39:08] (step=0944700) Train Loss: 0.1463, Train Steps/Sec: 11.73, Grad Norm: 0.0063 +[2025-02-21 20:39:17] (step=0944800) Train Loss: 0.1451, Train Steps/Sec: 11.03, Grad Norm: 0.0058 +[2025-02-21 20:39:26] (step=0944900) Train Loss: 0.1459, Train Steps/Sec: 10.95, Grad Norm: 0.0060 +[2025-02-21 20:39:35] (step=0945000) Train Loss: 0.1451, Train Steps/Sec: 11.01, Grad Norm: 0.0058 +[2025-02-21 20:39:43] (step=0945100) Train Loss: 0.1450, Train Steps/Sec: 11.70, Grad Norm: 0.0059 +[2025-02-21 20:39:51] (step=0945200) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 20:39:59] (step=0945300) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 20:40:07] (step=0945400) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 20:40:15] (step=0945500) Train Loss: 0.1448, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 20:40:23] (step=0945600) Train Loss: 0.1454, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 20:40:31] (step=0945700) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 20:40:40] (step=0945800) Train Loss: 0.1453, Train Steps/Sec: 11.23, Grad Norm: 0.0061 +[2025-02-21 20:40:48] (step=0945900) Train Loss: 0.1449, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 20:40:56] (step=0946000) Train Loss: 0.1450, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 20:41:04] (step=0946100) Train Loss: 0.1453, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 20:41:12] (step=0946200) Train Loss: 0.1449, Train Steps/Sec: 12.56, Grad Norm: 0.0064 +[2025-02-21 20:41:20] (step=0946300) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 20:41:29] (step=0946400) Train Loss: 0.1453, Train Steps/Sec: 10.98, Grad Norm: 0.0061 +[2025-02-21 20:41:38] (step=0946500) Train Loss: 0.1461, Train Steps/Sec: 10.65, Grad Norm: 0.0061 +[2025-02-21 20:41:47] (step=0946600) Train Loss: 0.1450, Train Steps/Sec: 11.29, Grad Norm: 0.0064 +[2025-02-21 20:41:56] (step=0946700) Train Loss: 0.1452, Train Steps/Sec: 10.95, Grad Norm: 0.0064 +[2025-02-21 20:42:04] (step=0946800) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0056 +[2025-02-21 20:42:12] (step=0946900) Train Loss: 0.1456, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 20:42:23] (step=0947000) Train Loss: 0.1457, Train Steps/Sec: 9.72, Grad Norm: 0.0061 +[2025-02-21 20:42:32] (step=0947100) Train Loss: 0.1448, Train Steps/Sec: 10.57, Grad Norm: 0.0058 +[2025-02-21 20:42:41] (step=0947200) Train Loss: 0.1461, Train Steps/Sec: 10.95, Grad Norm: 0.0061 +[2025-02-21 20:42:49] (step=0947300) Train Loss: 0.1454, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 20:42:57] (step=0947400) Train Loss: 0.1456, Train Steps/Sec: 12.63, Grad Norm: 0.0062 +[2025-02-21 20:43:05] (step=0947500) Train Loss: 0.1461, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 20:43:13] (step=0947600) Train Loss: 0.1454, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 20:43:21] (step=0947700) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 20:43:29] (step=0947800) Train Loss: 0.1452, Train Steps/Sec: 12.59, Grad Norm: 0.0067 +[2025-02-21 20:43:37] (step=0947900) Train Loss: 0.1452, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 20:43:45] (step=0948000) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 20:43:53] (step=0948100) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 20:44:01] (step=0948200) Train Loss: 0.1470, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 20:44:10] (step=0948300) Train Loss: 0.1465, Train Steps/Sec: 11.24, Grad Norm: 0.0058 +[2025-02-21 20:44:18] (step=0948400) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 20:44:26] (step=0948500) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 20:44:35] (step=0948600) Train Loss: 0.1464, Train Steps/Sec: 10.63, Grad Norm: 0.0060 +[2025-02-21 20:44:44] (step=0948700) Train Loss: 0.1448, Train Steps/Sec: 11.00, Grad Norm: 0.0061 +[2025-02-21 20:44:53] (step=0948800) Train Loss: 0.1462, Train Steps/Sec: 10.92, Grad Norm: 0.0061 +[2025-02-21 20:45:02] (step=0948900) Train Loss: 0.1457, Train Steps/Sec: 11.40, Grad Norm: 0.0059 +[2025-02-21 20:45:10] (step=0949000) Train Loss: 0.1458, Train Steps/Sec: 12.17, Grad Norm: 0.0061 +[2025-02-21 20:45:19] (step=0949100) Train Loss: 0.1445, Train Steps/Sec: 10.99, Grad Norm: 0.0059 +[2025-02-21 20:45:29] (step=0949200) Train Loss: 0.1454, Train Steps/Sec: 10.99, Grad Norm: 0.0061 +[2025-02-21 20:45:37] (step=0949300) Train Loss: 0.1463, Train Steps/Sec: 11.35, Grad Norm: 0.0059 +[2025-02-21 20:45:46] (step=0949400) Train Loss: 0.1454, Train Steps/Sec: 11.28, Grad Norm: 0.0058 +[2025-02-21 20:45:54] (step=0949500) Train Loss: 0.1455, Train Steps/Sec: 12.10, Grad Norm: 0.0058 +[2025-02-21 20:46:03] (step=0949600) Train Loss: 0.1450, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 20:46:11] (step=0949700) Train Loss: 0.1459, Train Steps/Sec: 12.60, Grad Norm: 0.0064 +[2025-02-21 20:46:19] (step=0949800) Train Loss: 0.1469, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 20:46:27] (step=0949900) Train Loss: 0.1446, Train Steps/Sec: 12.60, Grad Norm: 0.0057 +[2025-02-21 20:46:35] (step=0950000) Train Loss: 0.1445, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 20:46:43] (step=0950100) Train Loss: 0.1472, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 20:46:51] (step=0950200) Train Loss: 0.1447, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 20:46:59] (step=0950300) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 20:47:07] (step=0950400) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 20:47:15] (step=0950500) Train Loss: 0.1443, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 20:47:23] (step=0950600) Train Loss: 0.1441, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 20:47:31] (step=0950700) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 20:47:41] (step=0950800) Train Loss: 0.1449, Train Steps/Sec: 9.17, Grad Norm: 0.0064 +[2025-02-21 20:47:50] (step=0950900) Train Loss: 0.1472, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 20:48:00] (step=0951000) Train Loss: 0.1457, Train Steps/Sec: 10.30, Grad Norm: 0.0060 +[2025-02-21 20:48:08] (step=0951100) Train Loss: 0.1447, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 20:48:17] (step=0951200) Train Loss: 0.1449, Train Steps/Sec: 11.34, Grad Norm: 0.0062 +[2025-02-21 20:48:26] (step=0951300) Train Loss: 0.1463, Train Steps/Sec: 10.64, Grad Norm: 0.0062 +[2025-02-21 20:48:35] (step=0951400) Train Loss: 0.1465, Train Steps/Sec: 11.43, Grad Norm: 0.0064 +[2025-02-21 20:48:44] (step=0951500) Train Loss: 0.1462, Train Steps/Sec: 11.68, Grad Norm: 0.0059 +[2025-02-21 20:48:52] (step=0951600) Train Loss: 0.1458, Train Steps/Sec: 11.72, Grad Norm: 0.0062 +[2025-02-21 20:49:00] (step=0951700) Train Loss: 0.1454, Train Steps/Sec: 12.14, Grad Norm: 0.0062 +[2025-02-21 20:49:08] (step=0951800) Train Loss: 0.1460, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 20:49:16] (step=0951900) Train Loss: 0.1446, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 20:49:24] (step=0952000) Train Loss: 0.1456, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 20:49:33] (step=0952100) Train Loss: 0.1450, Train Steps/Sec: 11.29, Grad Norm: 0.0064 +[2025-02-21 20:49:41] (step=0952200) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 20:49:49] (step=0952300) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 20:49:57] (step=0952400) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 20:50:05] (step=0952500) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 20:50:13] (step=0952600) Train Loss: 0.1443, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 20:50:20] (step=0952700) Train Loss: 0.1470, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 20:50:28] (step=0952800) Train Loss: 0.1452, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 20:50:37] (step=0952900) Train Loss: 0.1455, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 20:50:46] (step=0953000) Train Loss: 0.1457, Train Steps/Sec: 10.36, Grad Norm: 0.0057 +[2025-02-21 20:50:55] (step=0953100) Train Loss: 0.1443, Train Steps/Sec: 11.35, Grad Norm: 0.0066 +[2025-02-21 20:51:05] (step=0953200) Train Loss: 0.1463, Train Steps/Sec: 10.58, Grad Norm: 0.0064 +[2025-02-21 20:51:14] (step=0953300) Train Loss: 0.1459, Train Steps/Sec: 10.55, Grad Norm: 0.0058 +[2025-02-21 20:51:23] (step=0953400) Train Loss: 0.1470, Train Steps/Sec: 10.67, Grad Norm: 0.0064 +[2025-02-21 20:51:32] (step=0953500) Train Loss: 0.1449, Train Steps/Sec: 11.36, Grad Norm: 0.0062 +[2025-02-21 20:51:41] (step=0953600) Train Loss: 0.1451, Train Steps/Sec: 11.35, Grad Norm: 0.0061 +[2025-02-21 20:51:49] (step=0953700) Train Loss: 0.1447, Train Steps/Sec: 12.11, Grad Norm: 0.0063 +[2025-02-21 20:51:58] (step=0953800) Train Loss: 0.1456, Train Steps/Sec: 11.69, Grad Norm: 0.0058 +[2025-02-21 20:52:06] (step=0953900) Train Loss: 0.1444, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 20:52:14] (step=0954000) Train Loss: 0.1473, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 20:52:22] (step=0954100) Train Loss: 0.1461, Train Steps/Sec: 12.58, Grad Norm: 0.0060 +[2025-02-21 20:52:30] (step=0954200) Train Loss: 0.1466, Train Steps/Sec: 12.57, Grad Norm: 0.0060 +[2025-02-21 20:52:38] (step=0954300) Train Loss: 0.1468, Train Steps/Sec: 12.58, Grad Norm: 0.0062 +[2025-02-21 20:52:46] (step=0954400) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 20:52:54] (step=0954500) Train Loss: 0.1462, Train Steps/Sec: 12.56, Grad Norm: 0.0062 +[2025-02-21 20:53:03] (step=0954600) Train Loss: 0.1447, Train Steps/Sec: 11.32, Grad Norm: 0.0059 +[2025-02-21 20:53:11] (step=0954700) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0067 +[2025-02-21 20:53:18] (step=0954800) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 20:53:26] (step=0954900) Train Loss: 0.1464, Train Steps/Sec: 12.58, Grad Norm: 0.0066 +[2025-02-21 20:53:34] (step=0955000) Train Loss: 0.1452, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 20:53:43] (step=0955100) Train Loss: 0.1455, Train Steps/Sec: 11.35, Grad Norm: 0.0057 +[2025-02-21 20:53:53] (step=0955200) Train Loss: 0.1463, Train Steps/Sec: 10.67, Grad Norm: 0.0062 +[2025-02-21 20:54:01] (step=0955300) Train Loss: 0.1457, Train Steps/Sec: 11.32, Grad Norm: 0.0060 +[2025-02-21 20:54:11] (step=0955400) Train Loss: 0.1448, Train Steps/Sec: 10.61, Grad Norm: 0.0060 +[2025-02-21 20:54:19] (step=0955500) Train Loss: 0.1465, Train Steps/Sec: 11.74, Grad Norm: 0.0064 +[2025-02-21 20:54:28] (step=0955600) Train Loss: 0.1462, Train Steps/Sec: 10.99, Grad Norm: 0.0060 +[2025-02-21 20:54:38] (step=0955700) Train Loss: 0.1457, Train Steps/Sec: 11.00, Grad Norm: 0.0060 +[2025-02-21 20:54:47] (step=0955800) Train Loss: 0.1451, Train Steps/Sec: 10.54, Grad Norm: 0.0064 +[2025-02-21 20:54:55] (step=0955900) Train Loss: 0.1454, Train Steps/Sec: 12.11, Grad Norm: 0.0062 +[2025-02-21 20:55:04] (step=0956000) Train Loss: 0.1454, Train Steps/Sec: 11.70, Grad Norm: 0.0065 +[2025-02-21 20:55:12] (step=0956100) Train Loss: 0.1466, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 20:55:20] (step=0956200) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 20:55:28] (step=0956300) Train Loss: 0.1458, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 20:55:36] (step=0956400) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 20:55:44] (step=0956500) Train Loss: 0.1467, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 20:55:52] (step=0956600) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 20:56:00] (step=0956700) Train Loss: 0.1471, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 20:56:08] (step=0956800) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 20:56:16] (step=0956900) Train Loss: 0.1463, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 20:56:24] (step=0957000) Train Loss: 0.1450, Train Steps/Sec: 12.56, Grad Norm: 0.0060 +[2025-02-21 20:56:32] (step=0957100) Train Loss: 0.1459, Train Steps/Sec: 11.31, Grad Norm: 0.0059 +[2025-02-21 20:56:40] (step=0957200) Train Loss: 0.1449, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 20:56:49] (step=0957300) Train Loss: 0.1473, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 20:56:59] (step=0957400) Train Loss: 0.1453, Train Steps/Sec: 10.39, Grad Norm: 0.0060 +[2025-02-21 20:57:08] (step=0957500) Train Loss: 0.1459, Train Steps/Sec: 11.31, Grad Norm: 0.0061 +[2025-02-21 20:57:17] (step=0957600) Train Loss: 0.1455, Train Steps/Sec: 10.31, Grad Norm: 0.0062 +[2025-02-21 20:57:26] (step=0957700) Train Loss: 0.1457, Train Steps/Sec: 11.36, Grad Norm: 0.0061 +[2025-02-21 20:57:35] (step=0957800) Train Loss: 0.1450, Train Steps/Sec: 11.43, Grad Norm: 0.0057 +[2025-02-21 20:57:43] (step=0957900) Train Loss: 0.1450, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 20:57:52] (step=0958000) Train Loss: 0.1470, Train Steps/Sec: 12.19, Grad Norm: 0.0063 +[2025-02-21 20:58:00] (step=0958100) Train Loss: 0.1455, Train Steps/Sec: 11.67, Grad Norm: 0.0061 +[2025-02-21 20:58:09] (step=0958200) Train Loss: 0.1447, Train Steps/Sec: 11.70, Grad Norm: 0.0059 +[2025-02-21 20:58:18] (step=0958300) Train Loss: 0.1477, Train Steps/Sec: 10.90, Grad Norm: 0.0059 +[2025-02-21 20:58:26] (step=0958400) Train Loss: 0.1475, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 20:58:34] (step=0958500) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 20:58:42] (step=0958600) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 20:58:50] (step=0958700) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 20:58:58] (step=0958800) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 20:59:05] (step=0958900) Train Loss: 0.1453, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 20:59:13] (step=0959000) Train Loss: 0.1457, Train Steps/Sec: 12.63, Grad Norm: 0.0056 +[2025-02-21 20:59:21] (step=0959100) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 20:59:29] (step=0959200) Train Loss: 0.1456, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 20:59:37] (step=0959300) Train Loss: 0.1456, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 20:59:45] (step=0959400) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 20:59:54] (step=0959500) Train Loss: 0.1447, Train Steps/Sec: 11.00, Grad Norm: 0.0059 +[2025-02-21 21:00:05] (step=0959600) Train Loss: 0.1458, Train Steps/Sec: 9.39, Grad Norm: 0.0064 +[2025-02-21 21:00:14] (step=0959700) Train Loss: 0.1476, Train Steps/Sec: 10.61, Grad Norm: 0.0061 +[2025-02-21 21:00:24] (step=0959800) Train Loss: 0.1466, Train Steps/Sec: 10.64, Grad Norm: 0.0058 +[2025-02-21 21:00:32] (step=0959900) Train Loss: 0.1469, Train Steps/Sec: 11.78, Grad Norm: 0.0062 +[2025-02-21 21:00:41] (step=0960000) Train Loss: 0.1456, Train Steps/Sec: 11.78, Grad Norm: 0.0060 +[2025-02-21 21:00:43] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0960000.pt +[2025-02-21 21:00:52] (step=0960100) Train Loss: 0.1450, Train Steps/Sec: 9.06, Grad Norm: 0.0059 +[2025-02-21 21:01:00] (step=0960200) Train Loss: 0.1460, Train Steps/Sec: 12.14, Grad Norm: 0.0063 +[2025-02-21 21:01:08] (step=0960300) Train Loss: 0.1461, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 21:01:17] (step=0960400) Train Loss: 0.1466, Train Steps/Sec: 11.75, Grad Norm: 0.0064 +[2025-02-21 21:01:25] (step=0960500) Train Loss: 0.1463, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 21:01:33] (step=0960600) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 21:01:41] (step=0960700) Train Loss: 0.1449, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 21:01:50] (step=0960800) Train Loss: 0.1453, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-21 21:01:58] (step=0960900) Train Loss: 0.1468, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 21:02:05] (step=0961000) Train Loss: 0.1466, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 21:02:13] (step=0961100) Train Loss: 0.1460, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 21:02:21] (step=0961200) Train Loss: 0.1454, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 21:02:29] (step=0961300) Train Loss: 0.1459, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 21:02:37] (step=0961400) Train Loss: 0.1462, Train Steps/Sec: 12.58, Grad Norm: 0.0065 +[2025-02-21 21:02:45] (step=0961500) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 21:02:53] (step=0961600) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 21:03:02] (step=0961700) Train Loss: 0.1454, Train Steps/Sec: 10.67, Grad Norm: 0.0060 +[2025-02-21 21:03:12] (step=0961800) Train Loss: 0.1464, Train Steps/Sec: 10.99, Grad Norm: 0.0061 +[2025-02-21 21:03:21] (step=0961900) Train Loss: 0.1463, Train Steps/Sec: 10.25, Grad Norm: 0.0061 +[2025-02-21 21:03:31] (step=0962000) Train Loss: 0.1454, Train Steps/Sec: 10.62, Grad Norm: 0.0058 +[2025-02-21 21:03:40] (step=0962100) Train Loss: 0.1460, Train Steps/Sec: 10.66, Grad Norm: 0.0060 +[2025-02-21 21:03:48] (step=0962200) Train Loss: 0.1464, Train Steps/Sec: 12.14, Grad Norm: 0.0066 +[2025-02-21 21:03:57] (step=0962300) Train Loss: 0.1453, Train Steps/Sec: 11.35, Grad Norm: 0.0063 +[2025-02-21 21:04:05] (step=0962400) Train Loss: 0.1447, Train Steps/Sec: 12.13, Grad Norm: 0.0060 +[2025-02-21 21:04:14] (step=0962500) Train Loss: 0.1461, Train Steps/Sec: 11.68, Grad Norm: 0.0059 +[2025-02-21 21:04:23] (step=0962600) Train Loss: 0.1455, Train Steps/Sec: 11.70, Grad Norm: 0.0059 +[2025-02-21 21:04:30] (step=0962700) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 21:04:38] (step=0962800) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 21:04:46] (step=0962900) Train Loss: 0.1457, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 21:04:54] (step=0963000) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 21:05:02] (step=0963100) Train Loss: 0.1453, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 21:05:10] (step=0963200) Train Loss: 0.1453, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 21:05:19] (step=0963300) Train Loss: 0.1457, Train Steps/Sec: 11.24, Grad Norm: 0.0060 +[2025-02-21 21:05:27] (step=0963400) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 21:05:35] (step=0963500) Train Loss: 0.1440, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 21:05:43] (step=0963600) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 21:05:51] (step=0963700) Train Loss: 0.1443, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 21:05:59] (step=0963800) Train Loss: 0.1459, Train Steps/Sec: 12.20, Grad Norm: 0.0060 +[2025-02-21 21:06:08] (step=0963900) Train Loss: 0.1455, Train Steps/Sec: 10.67, Grad Norm: 0.0061 +[2025-02-21 21:06:18] (step=0964000) Train Loss: 0.1444, Train Steps/Sec: 10.02, Grad Norm: 0.0061 +[2025-02-21 21:06:28] (step=0964100) Train Loss: 0.1454, Train Steps/Sec: 10.00, Grad Norm: 0.0060 +[2025-02-21 21:06:37] (step=0964200) Train Loss: 0.1451, Train Steps/Sec: 11.40, Grad Norm: 0.0061 +[2025-02-21 21:06:45] (step=0964300) Train Loss: 0.1458, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 21:06:53] (step=0964400) Train Loss: 0.1458, Train Steps/Sec: 12.18, Grad Norm: 0.0064 +[2025-02-21 21:07:02] (step=0964500) Train Loss: 0.1452, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-21 21:07:11] (step=0964600) Train Loss: 0.1459, Train Steps/Sec: 10.93, Grad Norm: 0.0061 +[2025-02-21 21:07:20] (step=0964700) Train Loss: 0.1454, Train Steps/Sec: 11.67, Grad Norm: 0.0059 +[2025-02-21 21:07:28] (step=0964800) Train Loss: 0.1458, Train Steps/Sec: 11.69, Grad Norm: 0.0064 +[2025-02-21 21:07:36] (step=0964900) Train Loss: 0.1454, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 21:07:44] (step=0965000) Train Loss: 0.1462, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 21:07:52] (step=0965100) Train Loss: 0.1445, Train Steps/Sec: 12.64, Grad Norm: 0.0064 +[2025-02-21 21:08:00] (step=0965200) Train Loss: 0.1459, Train Steps/Sec: 12.64, Grad Norm: 0.0060 +[2025-02-21 21:08:08] (step=0965300) Train Loss: 0.1455, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 21:08:16] (step=0965400) Train Loss: 0.1446, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 21:08:24] (step=0965500) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 21:08:32] (step=0965600) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 21:08:40] (step=0965700) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 21:08:49] (step=0965800) Train Loss: 0.1464, Train Steps/Sec: 11.27, Grad Norm: 0.0059 +[2025-02-21 21:08:57] (step=0965900) Train Loss: 0.1445, Train Steps/Sec: 12.17, Grad Norm: 0.0063 +[2025-02-21 21:09:05] (step=0966000) Train Loss: 0.1465, Train Steps/Sec: 12.16, Grad Norm: 0.0064 +[2025-02-21 21:09:15] (step=0966100) Train Loss: 0.1479, Train Steps/Sec: 10.00, Grad Norm: 0.0063 +[2025-02-21 21:09:24] (step=0966200) Train Loss: 0.1462, Train Steps/Sec: 10.63, Grad Norm: 0.0060 +[2025-02-21 21:09:34] (step=0966300) Train Loss: 0.1454, Train Steps/Sec: 10.01, Grad Norm: 0.0059 +[2025-02-21 21:09:43] (step=0966400) Train Loss: 0.1463, Train Steps/Sec: 12.17, Grad Norm: 0.0059 +[2025-02-21 21:09:51] (step=0966500) Train Loss: 0.1455, Train Steps/Sec: 12.58, Grad Norm: 0.0063 +[2025-02-21 21:09:59] (step=0966600) Train Loss: 0.1451, Train Steps/Sec: 11.71, Grad Norm: 0.0060 +[2025-02-21 21:10:08] (step=0966700) Train Loss: 0.1464, Train Steps/Sec: 11.77, Grad Norm: 0.0059 +[2025-02-21 21:10:16] (step=0966800) Train Loss: 0.1451, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 21:10:24] (step=0966900) Train Loss: 0.1451, Train Steps/Sec: 11.69, Grad Norm: 0.0058 +[2025-02-21 21:10:33] (step=0967000) Train Loss: 0.1445, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-21 21:10:42] (step=0967100) Train Loss: 0.1462, Train Steps/Sec: 11.29, Grad Norm: 0.0060 +[2025-02-21 21:10:50] (step=0967200) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 21:10:58] (step=0967300) Train Loss: 0.1466, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 21:11:06] (step=0967400) Train Loss: 0.1453, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 21:11:14] (step=0967500) Train Loss: 0.1446, Train Steps/Sec: 12.64, Grad Norm: 0.0067 +[2025-02-21 21:11:22] (step=0967600) Train Loss: 0.1441, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 21:11:29] (step=0967700) Train Loss: 0.1474, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 21:11:37] (step=0967800) Train Loss: 0.1465, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 21:11:45] (step=0967900) Train Loss: 0.1457, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 21:11:53] (step=0968000) Train Loss: 0.1443, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 21:12:01] (step=0968100) Train Loss: 0.1454, Train Steps/Sec: 12.21, Grad Norm: 0.0060 +[2025-02-21 21:12:10] (step=0968200) Train Loss: 0.1470, Train Steps/Sec: 12.20, Grad Norm: 0.0064 +[2025-02-21 21:12:21] (step=0968300) Train Loss: 0.1460, Train Steps/Sec: 8.72, Grad Norm: 0.0062 +[2025-02-21 21:12:31] (step=0968400) Train Loss: 0.1449, Train Steps/Sec: 10.20, Grad Norm: 0.0060 +[2025-02-21 21:12:41] (step=0968500) Train Loss: 0.1459, Train Steps/Sec: 10.31, Grad Norm: 0.0059 +[2025-02-21 21:12:49] (step=0968600) Train Loss: 0.1454, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 21:12:57] (step=0968700) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 21:13:05] (step=0968800) Train Loss: 0.1465, Train Steps/Sec: 11.66, Grad Norm: 0.0061 +[2025-02-21 21:13:14] (step=0968900) Train Loss: 0.1451, Train Steps/Sec: 11.73, Grad Norm: 0.0059 +[2025-02-21 21:13:22] (step=0969000) Train Loss: 0.1460, Train Steps/Sec: 12.15, Grad Norm: 0.0061 +[2025-02-21 21:13:30] (step=0969100) Train Loss: 0.1452, Train Steps/Sec: 11.73, Grad Norm: 0.0063 +[2025-02-21 21:13:39] (step=0969200) Train Loss: 0.1451, Train Steps/Sec: 11.70, Grad Norm: 0.0057 +[2025-02-21 21:13:47] (step=0969300) Train Loss: 0.1451, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 21:13:55] (step=0969400) Train Loss: 0.1475, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 21:14:03] (step=0969500) Train Loss: 0.1458, Train Steps/Sec: 12.50, Grad Norm: 0.0061 +[2025-02-21 21:14:12] (step=0969600) Train Loss: 0.1469, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-21 21:14:20] (step=0969700) Train Loss: 0.1453, Train Steps/Sec: 12.58, Grad Norm: 0.0057 +[2025-02-21 21:14:28] (step=0969800) Train Loss: 0.1442, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 21:14:35] (step=0969900) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0068 +[2025-02-21 21:14:43] (step=0970000) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 21:14:51] (step=0970100) Train Loss: 0.1462, Train Steps/Sec: 12.63, Grad Norm: 0.0063 +[2025-02-21 21:15:00] (step=0970200) Train Loss: 0.1457, Train Steps/Sec: 12.20, Grad Norm: 0.0063 +[2025-02-21 21:15:07] (step=0970300) Train Loss: 0.1473, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 21:15:17] (step=0970400) Train Loss: 0.1464, Train Steps/Sec: 11.03, Grad Norm: 0.0065 +[2025-02-21 21:15:27] (step=0970500) Train Loss: 0.1456, Train Steps/Sec: 9.49, Grad Norm: 0.0059 +[2025-02-21 21:15:36] (step=0970600) Train Loss: 0.1456, Train Steps/Sec: 10.58, Grad Norm: 0.0059 +[2025-02-21 21:15:46] (step=0970700) Train Loss: 0.1460, Train Steps/Sec: 10.95, Grad Norm: 0.0058 +[2025-02-21 21:15:55] (step=0970800) Train Loss: 0.1462, Train Steps/Sec: 11.21, Grad Norm: 0.0063 +[2025-02-21 21:16:02] (step=0970900) Train Loss: 0.1456, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 21:16:11] (step=0971000) Train Loss: 0.1467, Train Steps/Sec: 11.35, Grad Norm: 0.0059 +[2025-02-21 21:16:20] (step=0971100) Train Loss: 0.1449, Train Steps/Sec: 12.17, Grad Norm: 0.0058 +[2025-02-21 21:16:28] (step=0971200) Train Loss: 0.1470, Train Steps/Sec: 12.12, Grad Norm: 0.0062 +[2025-02-21 21:16:36] (step=0971300) Train Loss: 0.1463, Train Steps/Sec: 11.70, Grad Norm: 0.0068 +[2025-02-21 21:16:45] (step=0971400) Train Loss: 0.1455, Train Steps/Sec: 11.70, Grad Norm: 0.0060 +[2025-02-21 21:16:53] (step=0971500) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 21:17:01] (step=0971600) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 21:17:09] (step=0971700) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 21:17:17] (step=0971800) Train Loss: 0.1470, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 21:17:25] (step=0971900) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 21:17:33] (step=0972000) Train Loss: 0.1454, Train Steps/Sec: 12.53, Grad Norm: 0.0058 +[2025-02-21 21:17:41] (step=0972100) Train Loss: 0.1455, Train Steps/Sec: 11.24, Grad Norm: 0.0061 +[2025-02-21 21:17:49] (step=0972200) Train Loss: 0.1451, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 21:17:57] (step=0972300) Train Loss: 0.1469, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 21:18:06] (step=0972400) Train Loss: 0.1460, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 21:18:14] (step=0972500) Train Loss: 0.1462, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 21:18:23] (step=0972600) Train Loss: 0.1455, Train Steps/Sec: 10.98, Grad Norm: 0.0059 +[2025-02-21 21:18:34] (step=0972700) Train Loss: 0.1439, Train Steps/Sec: 9.26, Grad Norm: 0.0059 +[2025-02-21 21:18:43] (step=0972800) Train Loss: 0.1463, Train Steps/Sec: 11.27, Grad Norm: 0.0062 +[2025-02-21 21:18:52] (step=0972900) Train Loss: 0.1441, Train Steps/Sec: 10.98, Grad Norm: 0.0067 +[2025-02-21 21:19:00] (step=0973000) Train Loss: 0.1459, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 21:19:08] (step=0973100) Train Loss: 0.1449, Train Steps/Sec: 12.59, Grad Norm: 0.0059 +[2025-02-21 21:19:16] (step=0973200) Train Loss: 0.1465, Train Steps/Sec: 11.34, Grad Norm: 0.0065 +[2025-02-21 21:19:26] (step=0973300) Train Loss: 0.1448, Train Steps/Sec: 10.91, Grad Norm: 0.0060 +[2025-02-21 21:19:34] (step=0973400) Train Loss: 0.1464, Train Steps/Sec: 12.15, Grad Norm: 0.0058 +[2025-02-21 21:19:42] (step=0973500) Train Loss: 0.1460, Train Steps/Sec: 11.65, Grad Norm: 0.0060 +[2025-02-21 21:19:51] (step=0973600) Train Loss: 0.1461, Train Steps/Sec: 11.68, Grad Norm: 0.0061 +[2025-02-21 21:19:59] (step=0973700) Train Loss: 0.1447, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 21:20:07] (step=0973800) Train Loss: 0.1452, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 21:20:15] (step=0973900) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 21:20:23] (step=0974000) Train Loss: 0.1449, Train Steps/Sec: 12.59, Grad Norm: 0.0058 +[2025-02-21 21:20:31] (step=0974100) Train Loss: 0.1451, Train Steps/Sec: 12.59, Grad Norm: 0.0064 +[2025-02-21 21:20:39] (step=0974200) Train Loss: 0.1454, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 21:20:46] (step=0974300) Train Loss: 0.1462, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 21:20:54] (step=0974400) Train Loss: 0.1449, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 21:21:03] (step=0974500) Train Loss: 0.1467, Train Steps/Sec: 12.12, Grad Norm: 0.0062 +[2025-02-21 21:21:12] (step=0974600) Train Loss: 0.1454, Train Steps/Sec: 11.26, Grad Norm: 0.0069 +[2025-02-21 21:21:20] (step=0974700) Train Loss: 0.1445, Train Steps/Sec: 11.72, Grad Norm: 0.0060 +[2025-02-21 21:21:30] (step=0974800) Train Loss: 0.1468, Train Steps/Sec: 9.71, Grad Norm: 0.0058 +[2025-02-21 21:21:40] (step=0974900) Train Loss: 0.1473, Train Steps/Sec: 10.64, Grad Norm: 0.0061 +[2025-02-21 21:21:49] (step=0975000) Train Loss: 0.1451, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-21 21:21:58] (step=0975100) Train Loss: 0.1455, Train Steps/Sec: 10.94, Grad Norm: 0.0060 +[2025-02-21 21:22:06] (step=0975200) Train Loss: 0.1455, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 21:22:14] (step=0975300) Train Loss: 0.1461, Train Steps/Sec: 12.59, Grad Norm: 0.0057 +[2025-02-21 21:22:23] (step=0975400) Train Loss: 0.1458, Train Steps/Sec: 11.34, Grad Norm: 0.0060 +[2025-02-21 21:22:31] (step=0975500) Train Loss: 0.1457, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 21:22:39] (step=0975600) Train Loss: 0.1441, Train Steps/Sec: 12.13, Grad Norm: 0.0062 +[2025-02-21 21:22:48] (step=0975700) Train Loss: 0.1460, Train Steps/Sec: 11.66, Grad Norm: 0.0058 +[2025-02-21 21:22:57] (step=0975800) Train Loss: 0.1464, Train Steps/Sec: 10.44, Grad Norm: 0.0061 +[2025-02-21 21:23:05] (step=0975900) Train Loss: 0.1476, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 21:23:13] (step=0976000) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 21:23:21] (step=0976100) Train Loss: 0.1469, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 21:23:29] (step=0976200) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 21:23:37] (step=0976300) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 21:23:45] (step=0976400) Train Loss: 0.1449, Train Steps/Sec: 12.55, Grad Norm: 0.0060 +[2025-02-21 21:23:53] (step=0976500) Train Loss: 0.1446, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 21:24:01] (step=0976600) Train Loss: 0.1453, Train Steps/Sec: 12.19, Grad Norm: 0.0062 +[2025-02-21 21:24:09] (step=0976700) Train Loss: 0.1443, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 21:24:17] (step=0976800) Train Loss: 0.1451, Train Steps/Sec: 12.14, Grad Norm: 0.0065 +[2025-02-21 21:24:26] (step=0976900) Train Loss: 0.1449, Train Steps/Sec: 11.02, Grad Norm: 0.0065 +[2025-02-21 21:24:36] (step=0977000) Train Loss: 0.1459, Train Steps/Sec: 10.01, Grad Norm: 0.0060 +[2025-02-21 21:24:47] (step=0977100) Train Loss: 0.1456, Train Steps/Sec: 9.59, Grad Norm: 0.0059 +[2025-02-21 21:24:55] (step=0977200) Train Loss: 0.1462, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 21:25:04] (step=0977300) Train Loss: 0.1456, Train Steps/Sec: 11.31, Grad Norm: 0.0062 +[2025-02-21 21:25:12] (step=0977400) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 21:25:20] (step=0977500) Train Loss: 0.1456, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 21:25:29] (step=0977600) Train Loss: 0.1450, Train Steps/Sec: 11.31, Grad Norm: 0.0055 +[2025-02-21 21:25:37] (step=0977700) Train Loss: 0.1451, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 21:25:45] (step=0977800) Train Loss: 0.1464, Train Steps/Sec: 12.14, Grad Norm: 0.0060 +[2025-02-21 21:25:54] (step=0977900) Train Loss: 0.1452, Train Steps/Sec: 10.93, Grad Norm: 0.0060 +[2025-02-21 21:26:02] (step=0978000) Train Loss: 0.1449, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 21:26:10] (step=0978100) Train Loss: 0.1463, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 21:26:18] (step=0978200) Train Loss: 0.1456, Train Steps/Sec: 12.56, Grad Norm: 0.0058 +[2025-02-21 21:26:27] (step=0978300) Train Loss: 0.1461, Train Steps/Sec: 11.24, Grad Norm: 0.0065 +[2025-02-21 21:26:35] (step=0978400) Train Loss: 0.1460, Train Steps/Sec: 12.55, Grad Norm: 0.0062 +[2025-02-21 21:26:43] (step=0978500) Train Loss: 0.1460, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 21:26:51] (step=0978600) Train Loss: 0.1452, Train Steps/Sec: 12.60, Grad Norm: 0.0058 +[2025-02-21 21:26:59] (step=0978700) Train Loss: 0.1461, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 21:27:07] (step=0978800) Train Loss: 0.1472, Train Steps/Sec: 12.20, Grad Norm: 0.0060 +[2025-02-21 21:27:15] (step=0978900) Train Loss: 0.1450, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 21:27:24] (step=0979000) Train Loss: 0.1453, Train Steps/Sec: 11.79, Grad Norm: 0.0060 +[2025-02-21 21:27:33] (step=0979100) Train Loss: 0.1451, Train Steps/Sec: 11.36, Grad Norm: 0.0060 +[2025-02-21 21:27:43] (step=0979200) Train Loss: 0.1471, Train Steps/Sec: 9.76, Grad Norm: 0.0061 +[2025-02-21 21:27:52] (step=0979300) Train Loss: 0.1453, Train Steps/Sec: 11.20, Grad Norm: 0.0059 +[2025-02-21 21:28:00] (step=0979400) Train Loss: 0.1450, Train Steps/Sec: 11.70, Grad Norm: 0.0061 +[2025-02-21 21:28:09] (step=0979500) Train Loss: 0.1456, Train Steps/Sec: 11.27, Grad Norm: 0.0058 +[2025-02-21 21:28:18] (step=0979600) Train Loss: 0.1455, Train Steps/Sec: 11.20, Grad Norm: 0.0059 +[2025-02-21 21:28:26] (step=0979700) Train Loss: 0.1456, Train Steps/Sec: 12.57, Grad Norm: 0.0059 +[2025-02-21 21:28:35] (step=0979800) Train Loss: 0.1457, Train Steps/Sec: 11.31, Grad Norm: 0.0059 +[2025-02-21 21:28:43] (step=0979900) Train Loss: 0.1445, Train Steps/Sec: 12.11, Grad Norm: 0.0061 +[2025-02-21 21:28:51] (step=0980000) Train Loss: 0.1472, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 21:28:55] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/0980000.pt +[2025-02-21 21:29:04] (step=0980100) Train Loss: 0.1462, Train Steps/Sec: 7.66, Grad Norm: 0.0060 +[2025-02-21 21:29:12] (step=0980200) Train Loss: 0.1468, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 21:29:20] (step=0980300) Train Loss: 0.1459, Train Steps/Sec: 12.63, Grad Norm: 0.0057 +[2025-02-21 21:29:28] (step=0980400) Train Loss: 0.1453, Train Steps/Sec: 12.64, Grad Norm: 0.0063 +[2025-02-21 21:29:36] (step=0980500) Train Loss: 0.1448, Train Steps/Sec: 12.64, Grad Norm: 0.0057 +[2025-02-21 21:29:44] (step=0980600) Train Loss: 0.1442, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 21:29:52] (step=0980700) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0057 +[2025-02-21 21:30:01] (step=0980800) Train Loss: 0.1465, Train Steps/Sec: 11.14, Grad Norm: 0.0062 +[2025-02-21 21:30:09] (step=0980900) Train Loss: 0.1453, Train Steps/Sec: 12.19, Grad Norm: 0.0060 +[2025-02-21 21:30:17] (step=0981000) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0060 +[2025-02-21 21:30:26] (step=0981100) Train Loss: 0.1451, Train Steps/Sec: 11.76, Grad Norm: 0.0065 +[2025-02-21 21:30:34] (step=0981200) Train Loss: 0.1429, Train Steps/Sec: 11.36, Grad Norm: 0.0060 +[2025-02-21 21:30:43] (step=0981300) Train Loss: 0.1455, Train Steps/Sec: 11.42, Grad Norm: 0.0061 +[2025-02-21 21:30:53] (step=0981400) Train Loss: 0.1463, Train Steps/Sec: 10.35, Grad Norm: 0.0062 +[2025-02-21 21:31:02] (step=0981500) Train Loss: 0.1451, Train Steps/Sec: 11.31, Grad Norm: 0.0061 +[2025-02-21 21:31:11] (step=0981600) Train Loss: 0.1452, Train Steps/Sec: 10.64, Grad Norm: 0.0060 +[2025-02-21 21:31:19] (step=0981700) Train Loss: 0.1455, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 21:31:27] (step=0981800) Train Loss: 0.1468, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 21:31:35] (step=0981900) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 21:31:44] (step=0982000) Train Loss: 0.1470, Train Steps/Sec: 11.33, Grad Norm: 0.0062 +[2025-02-21 21:31:53] (step=0982100) Train Loss: 0.1450, Train Steps/Sec: 10.78, Grad Norm: 0.0063 +[2025-02-21 21:32:01] (step=0982200) Train Loss: 0.1468, Train Steps/Sec: 11.66, Grad Norm: 0.0058 +[2025-02-21 21:32:10] (step=0982300) Train Loss: 0.1456, Train Steps/Sec: 11.28, Grad Norm: 0.0058 +[2025-02-21 21:32:18] (step=0982400) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 21:32:26] (step=0982500) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 21:32:34] (step=0982600) Train Loss: 0.1451, Train Steps/Sec: 12.62, Grad Norm: 0.0065 +[2025-02-21 21:32:42] (step=0982700) Train Loss: 0.1452, Train Steps/Sec: 12.61, Grad Norm: 0.0060 +[2025-02-21 21:32:50] (step=0982800) Train Loss: 0.1457, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 21:32:58] (step=0982900) Train Loss: 0.1459, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 21:33:06] (step=0983000) Train Loss: 0.1455, Train Steps/Sec: 12.19, Grad Norm: 0.0058 +[2025-02-21 21:33:14] (step=0983100) Train Loss: 0.1446, Train Steps/Sec: 12.58, Grad Norm: 0.0061 +[2025-02-21 21:33:23] (step=0983200) Train Loss: 0.1461, Train Steps/Sec: 11.77, Grad Norm: 0.0061 +[2025-02-21 21:33:32] (step=0983300) Train Loss: 0.1458, Train Steps/Sec: 10.18, Grad Norm: 0.0060 +[2025-02-21 21:33:41] (step=0983400) Train Loss: 0.1448, Train Steps/Sec: 11.77, Grad Norm: 0.0061 +[2025-02-21 21:33:49] (step=0983500) Train Loss: 0.1455, Train Steps/Sec: 11.73, Grad Norm: 0.0064 +[2025-02-21 21:33:59] (step=0983600) Train Loss: 0.1443, Train Steps/Sec: 10.66, Grad Norm: 0.0062 +[2025-02-21 21:34:08] (step=0983700) Train Loss: 0.1466, Train Steps/Sec: 11.32, Grad Norm: 0.0064 +[2025-02-21 21:34:17] (step=0983800) Train Loss: 0.1445, Train Steps/Sec: 10.64, Grad Norm: 0.0062 +[2025-02-21 21:34:25] (step=0983900) Train Loss: 0.1465, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 21:34:33] (step=0984000) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 21:34:41] (step=0984100) Train Loss: 0.1456, Train Steps/Sec: 12.11, Grad Norm: 0.0063 +[2025-02-21 21:34:50] (step=0984200) Train Loss: 0.1472, Train Steps/Sec: 11.74, Grad Norm: 0.0062 +[2025-02-21 21:34:58] (step=0984300) Train Loss: 0.1466, Train Steps/Sec: 12.14, Grad Norm: 0.0058 +[2025-02-21 21:35:06] (step=0984400) Train Loss: 0.1459, Train Steps/Sec: 12.18, Grad Norm: 0.0062 +[2025-02-21 21:35:15] (step=0984500) Train Loss: 0.1454, Train Steps/Sec: 10.90, Grad Norm: 0.0069 +[2025-02-21 21:35:24] (step=0984600) Train Loss: 0.1448, Train Steps/Sec: 11.20, Grad Norm: 0.0059 +[2025-02-21 21:35:32] (step=0984700) Train Loss: 0.1476, Train Steps/Sec: 12.53, Grad Norm: 0.0066 +[2025-02-21 21:35:40] (step=0984800) Train Loss: 0.1451, Train Steps/Sec: 12.52, Grad Norm: 0.0061 +[2025-02-21 21:35:48] (step=0984900) Train Loss: 0.1458, Train Steps/Sec: 12.56, Grad Norm: 0.0061 +[2025-02-21 21:35:56] (step=0985000) Train Loss: 0.1458, Train Steps/Sec: 12.55, Grad Norm: 0.0060 +[2025-02-21 21:36:04] (step=0985100) Train Loss: 0.1450, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 21:36:12] (step=0985200) Train Loss: 0.1454, Train Steps/Sec: 12.19, Grad Norm: 0.0062 +[2025-02-21 21:36:20] (step=0985300) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 21:36:29] (step=0985400) Train Loss: 0.1465, Train Steps/Sec: 11.40, Grad Norm: 0.0057 +[2025-02-21 21:36:38] (step=0985500) Train Loss: 0.1466, Train Steps/Sec: 11.33, Grad Norm: 0.0059 +[2025-02-21 21:36:46] (step=0985600) Train Loss: 0.1462, Train Steps/Sec: 12.20, Grad Norm: 0.0062 +[2025-02-21 21:36:55] (step=0985700) Train Loss: 0.1458, Train Steps/Sec: 11.68, Grad Norm: 0.0060 +[2025-02-21 21:37:05] (step=0985800) Train Loss: 0.1463, Train Steps/Sec: 9.34, Grad Norm: 0.0059 +[2025-02-21 21:37:14] (step=0985900) Train Loss: 0.1460, Train Steps/Sec: 11.70, Grad Norm: 0.0065 +[2025-02-21 21:37:23] (step=0986000) Train Loss: 0.1446, Train Steps/Sec: 10.56, Grad Norm: 0.0062 +[2025-02-21 21:37:31] (step=0986100) Train Loss: 0.1461, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 21:37:39] (step=0986200) Train Loss: 0.1464, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 21:37:47] (step=0986300) Train Loss: 0.1456, Train Steps/Sec: 12.18, Grad Norm: 0.0065 +[2025-02-21 21:37:56] (step=0986400) Train Loss: 0.1461, Train Steps/Sec: 11.70, Grad Norm: 0.0061 +[2025-02-21 21:38:05] (step=0986500) Train Loss: 0.1466, Train Steps/Sec: 11.68, Grad Norm: 0.0059 +[2025-02-21 21:38:12] (step=0986600) Train Loss: 0.1454, Train Steps/Sec: 12.57, Grad Norm: 0.0061 +[2025-02-21 21:38:22] (step=0986700) Train Loss: 0.1443, Train Steps/Sec: 10.93, Grad Norm: 0.0058 +[2025-02-21 21:38:30] (step=0986800) Train Loss: 0.1453, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 21:38:38] (step=0986900) Train Loss: 0.1447, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 21:38:45] (step=0987000) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 21:38:54] (step=0987100) Train Loss: 0.1445, Train Steps/Sec: 11.14, Grad Norm: 0.0060 +[2025-02-21 21:39:02] (step=0987200) Train Loss: 0.1464, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 21:39:11] (step=0987300) Train Loss: 0.1456, Train Steps/Sec: 12.12, Grad Norm: 0.0061 +[2025-02-21 21:39:19] (step=0987400) Train Loss: 0.1467, Train Steps/Sec: 12.61, Grad Norm: 0.0059 +[2025-02-21 21:39:27] (step=0987500) Train Loss: 0.1457, Train Steps/Sec: 11.76, Grad Norm: 0.0060 +[2025-02-21 21:39:36] (step=0987600) Train Loss: 0.1473, Train Steps/Sec: 11.35, Grad Norm: 0.0060 +[2025-02-21 21:39:44] (step=0987700) Train Loss: 0.1449, Train Steps/Sec: 11.80, Grad Norm: 0.0059 +[2025-02-21 21:39:52] (step=0987800) Train Loss: 0.1447, Train Steps/Sec: 12.60, Grad Norm: 0.0060 +[2025-02-21 21:40:01] (step=0987900) Train Loss: 0.1450, Train Steps/Sec: 11.31, Grad Norm: 0.0061 +[2025-02-21 21:40:11] (step=0988000) Train Loss: 0.1459, Train Steps/Sec: 10.64, Grad Norm: 0.0063 +[2025-02-21 21:40:19] (step=0988100) Train Loss: 0.1449, Train Steps/Sec: 11.68, Grad Norm: 0.0060 +[2025-02-21 21:40:29] (step=0988200) Train Loss: 0.1450, Train Steps/Sec: 10.62, Grad Norm: 0.0063 +[2025-02-21 21:40:37] (step=0988300) Train Loss: 0.1469, Train Steps/Sec: 11.27, Grad Norm: 0.0062 +[2025-02-21 21:40:45] (step=0988400) Train Loss: 0.1466, Train Steps/Sec: 12.56, Grad Norm: 0.0063 +[2025-02-21 21:40:54] (step=0988500) Train Loss: 0.1446, Train Steps/Sec: 12.14, Grad Norm: 0.0063 +[2025-02-21 21:41:02] (step=0988600) Train Loss: 0.1451, Train Steps/Sec: 11.70, Grad Norm: 0.0059 +[2025-02-21 21:41:11] (step=0988700) Train Loss: 0.1454, Train Steps/Sec: 11.71, Grad Norm: 0.0059 +[2025-02-21 21:41:19] (step=0988800) Train Loss: 0.1460, Train Steps/Sec: 12.07, Grad Norm: 0.0060 +[2025-02-21 21:41:28] (step=0988900) Train Loss: 0.1457, Train Steps/Sec: 11.29, Grad Norm: 0.0063 +[2025-02-21 21:41:36] (step=0989000) Train Loss: 0.1450, Train Steps/Sec: 12.58, Grad Norm: 0.0059 +[2025-02-21 21:41:44] (step=0989100) Train Loss: 0.1466, Train Steps/Sec: 12.59, Grad Norm: 0.0063 +[2025-02-21 21:41:52] (step=0989200) Train Loss: 0.1454, Train Steps/Sec: 12.59, Grad Norm: 0.0061 +[2025-02-21 21:42:00] (step=0989300) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 21:42:08] (step=0989400) Train Loss: 0.1463, Train Steps/Sec: 12.18, Grad Norm: 0.0060 +[2025-02-21 21:42:16] (step=0989500) Train Loss: 0.1444, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 21:42:25] (step=0989600) Train Loss: 0.1458, Train Steps/Sec: 10.54, Grad Norm: 0.0057 +[2025-02-21 21:42:34] (step=0989700) Train Loss: 0.1477, Train Steps/Sec: 11.74, Grad Norm: 0.0063 +[2025-02-21 21:42:43] (step=0989800) Train Loss: 0.1450, Train Steps/Sec: 11.39, Grad Norm: 0.0060 +[2025-02-21 21:42:50] (step=0989900) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 21:42:58] (step=0990000) Train Loss: 0.1444, Train Steps/Sec: 12.62, Grad Norm: 0.0059 +[2025-02-21 21:43:07] (step=0990100) Train Loss: 0.1461, Train Steps/Sec: 11.33, Grad Norm: 0.0063 +[2025-02-21 21:43:17] (step=0990200) Train Loss: 0.1454, Train Steps/Sec: 10.34, Grad Norm: 0.0060 +[2025-02-21 21:43:25] (step=0990300) Train Loss: 0.1457, Train Steps/Sec: 11.72, Grad Norm: 0.0063 +[2025-02-21 21:43:35] (step=0990400) Train Loss: 0.1445, Train Steps/Sec: 10.95, Grad Norm: 0.0064 +[2025-02-21 21:43:42] (step=0990500) Train Loss: 0.1464, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 21:43:50] (step=0990600) Train Loss: 0.1467, Train Steps/Sec: 12.62, Grad Norm: 0.0063 +[2025-02-21 21:43:59] (step=0990700) Train Loss: 0.1467, Train Steps/Sec: 12.20, Grad Norm: 0.0062 +[2025-02-21 21:44:08] (step=0990800) Train Loss: 0.1441, Train Steps/Sec: 10.21, Grad Norm: 0.0061 +[2025-02-21 21:44:17] (step=0990900) Train Loss: 0.1448, Train Steps/Sec: 12.15, Grad Norm: 0.0059 +[2025-02-21 21:44:25] (step=0991000) Train Loss: 0.1449, Train Steps/Sec: 12.13, Grad Norm: 0.0059 +[2025-02-21 21:44:34] (step=0991100) Train Loss: 0.1461, Train Steps/Sec: 11.25, Grad Norm: 0.0060 +[2025-02-21 21:44:42] (step=0991200) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 21:44:50] (step=0991300) Train Loss: 0.1463, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 21:44:58] (step=0991400) Train Loss: 0.1441, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 21:45:05] (step=0991500) Train Loss: 0.1462, Train Steps/Sec: 12.61, Grad Norm: 0.0058 +[2025-02-21 21:45:14] (step=0991600) Train Loss: 0.1444, Train Steps/Sec: 12.17, Grad Norm: 0.0060 +[2025-02-21 21:45:22] (step=0991700) Train Loss: 0.1464, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 21:45:30] (step=0991800) Train Loss: 0.1438, Train Steps/Sec: 11.37, Grad Norm: 0.0060 +[2025-02-21 21:45:39] (step=0991900) Train Loss: 0.1453, Train Steps/Sec: 11.72, Grad Norm: 0.0063 +[2025-02-21 21:45:47] (step=0992000) Train Loss: 0.1449, Train Steps/Sec: 11.80, Grad Norm: 0.0061 +[2025-02-21 21:45:56] (step=0992100) Train Loss: 0.1464, Train Steps/Sec: 11.28, Grad Norm: 0.0061 +[2025-02-21 21:46:05] (step=0992200) Train Loss: 0.1474, Train Steps/Sec: 12.13, Grad Norm: 0.0063 +[2025-02-21 21:46:13] (step=0992300) Train Loss: 0.1463, Train Steps/Sec: 11.72, Grad Norm: 0.0059 +[2025-02-21 21:46:23] (step=0992400) Train Loss: 0.1457, Train Steps/Sec: 9.98, Grad Norm: 0.0065 +[2025-02-21 21:46:32] (step=0992500) Train Loss: 0.1459, Train Steps/Sec: 11.38, Grad Norm: 0.0062 +[2025-02-21 21:46:40] (step=0992600) Train Loss: 0.1462, Train Steps/Sec: 11.73, Grad Norm: 0.0060 +[2025-02-21 21:46:48] (step=0992700) Train Loss: 0.1448, Train Steps/Sec: 12.64, Grad Norm: 0.0062 +[2025-02-21 21:46:56] (step=0992800) Train Loss: 0.1453, Train Steps/Sec: 12.62, Grad Norm: 0.0062 +[2025-02-21 21:47:04] (step=0992900) Train Loss: 0.1456, Train Steps/Sec: 12.17, Grad Norm: 0.0063 +[2025-02-21 21:47:13] (step=0993000) Train Loss: 0.1460, Train Steps/Sec: 11.75, Grad Norm: 0.0059 +[2025-02-21 21:47:22] (step=0993100) Train Loss: 0.1451, Train Steps/Sec: 11.72, Grad Norm: 0.0061 +[2025-02-21 21:47:30] (step=0993200) Train Loss: 0.1447, Train Steps/Sec: 11.76, Grad Norm: 0.0061 +[2025-02-21 21:47:39] (step=0993300) Train Loss: 0.1471, Train Steps/Sec: 10.57, Grad Norm: 0.0065 +[2025-02-21 21:47:47] (step=0993400) Train Loss: 0.1456, Train Steps/Sec: 12.58, Grad Norm: 0.0058 +[2025-02-21 21:47:55] (step=0993500) Train Loss: 0.1446, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 21:48:03] (step=0993600) Train Loss: 0.1441, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 21:48:12] (step=0993700) Train Loss: 0.1458, Train Steps/Sec: 12.15, Grad Norm: 0.0063 +[2025-02-21 21:48:19] (step=0993800) Train Loss: 0.1458, Train Steps/Sec: 12.61, Grad Norm: 0.0063 +[2025-02-21 21:48:28] (step=0993900) Train Loss: 0.1465, Train Steps/Sec: 11.74, Grad Norm: 0.0059 +[2025-02-21 21:48:36] (step=0994000) Train Loss: 0.1457, Train Steps/Sec: 11.75, Grad Norm: 0.0060 +[2025-02-21 21:48:45] (step=0994100) Train Loss: 0.1463, Train Steps/Sec: 11.40, Grad Norm: 0.0064 +[2025-02-21 21:48:53] (step=0994200) Train Loss: 0.1444, Train Steps/Sec: 12.60, Grad Norm: 0.0059 +[2025-02-21 21:49:01] (step=0994300) Train Loss: 0.1441, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 21:49:09] (step=0994400) Train Loss: 0.1466, Train Steps/Sec: 12.12, Grad Norm: 0.0059 +[2025-02-21 21:49:18] (step=0994500) Train Loss: 0.1448, Train Steps/Sec: 11.34, Grad Norm: 0.0061 +[2025-02-21 21:49:29] (step=0994600) Train Loss: 0.1459, Train Steps/Sec: 9.40, Grad Norm: 0.0061 +[2025-02-21 21:49:38] (step=0994700) Train Loss: 0.1466, Train Steps/Sec: 10.92, Grad Norm: 0.0062 +[2025-02-21 21:49:46] (step=0994800) Train Loss: 0.1458, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 21:49:54] (step=0994900) Train Loss: 0.1454, Train Steps/Sec: 12.61, Grad Norm: 0.0062 +[2025-02-21 21:50:02] (step=0995000) Train Loss: 0.1451, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 21:50:10] (step=0995100) Train Loss: 0.1458, Train Steps/Sec: 11.78, Grad Norm: 0.0070 +[2025-02-21 21:50:19] (step=0995200) Train Loss: 0.1462, Train Steps/Sec: 11.71, Grad Norm: 0.0064 +[2025-02-21 21:50:27] (step=0995300) Train Loss: 0.1465, Train Steps/Sec: 12.17, Grad Norm: 0.0059 +[2025-02-21 21:50:36] (step=0995400) Train Loss: 0.1470, Train Steps/Sec: 11.76, Grad Norm: 0.0062 +[2025-02-21 21:50:44] (step=0995500) Train Loss: 0.1451, Train Steps/Sec: 11.69, Grad Norm: 0.0065 +[2025-02-21 21:50:52] (step=0995600) Train Loss: 0.1462, Train Steps/Sec: 12.60, Grad Norm: 0.0062 +[2025-02-21 21:51:00] (step=0995700) Train Loss: 0.1453, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 21:51:09] (step=0995800) Train Loss: 0.1461, Train Steps/Sec: 10.93, Grad Norm: 0.0063 +[2025-02-21 21:51:17] (step=0995900) Train Loss: 0.1447, Train Steps/Sec: 12.55, Grad Norm: 0.0062 +[2025-02-21 21:51:25] (step=0996000) Train Loss: 0.1457, Train Steps/Sec: 12.08, Grad Norm: 0.0058 +[2025-02-21 21:51:34] (step=0996100) Train Loss: 0.1453, Train Steps/Sec: 11.32, Grad Norm: 0.0061 +[2025-02-21 21:51:43] (step=0996200) Train Loss: 0.1454, Train Steps/Sec: 11.39, Grad Norm: 0.0061 +[2025-02-21 21:51:51] (step=0996300) Train Loss: 0.1446, Train Steps/Sec: 12.60, Grad Norm: 0.0065 +[2025-02-21 21:51:59] (step=0996400) Train Loss: 0.1456, Train Steps/Sec: 12.60, Grad Norm: 0.0061 +[2025-02-21 21:52:07] (step=0996500) Train Loss: 0.1458, Train Steps/Sec: 12.59, Grad Norm: 0.0060 +[2025-02-21 21:52:15] (step=0996600) Train Loss: 0.1465, Train Steps/Sec: 11.70, Grad Norm: 0.0059 +[2025-02-21 21:52:25] (step=0996700) Train Loss: 0.1456, Train Steps/Sec: 10.67, Grad Norm: 0.0060 +[2025-02-21 21:52:34] (step=0996800) Train Loss: 0.1466, Train Steps/Sec: 11.31, Grad Norm: 0.0062 +[2025-02-21 21:52:42] (step=0996900) Train Loss: 0.1457, Train Steps/Sec: 11.33, Grad Norm: 0.0059 +[2025-02-21 21:52:51] (step=0997000) Train Loss: 0.1463, Train Steps/Sec: 12.14, Grad Norm: 0.0061 +[2025-02-21 21:53:00] (step=0997100) Train Loss: 0.1456, Train Steps/Sec: 11.29, Grad Norm: 0.0061 +[2025-02-21 21:53:07] (step=0997200) Train Loss: 0.1466, Train Steps/Sec: 12.61, Grad Norm: 0.0061 +[2025-02-21 21:53:16] (step=0997300) Train Loss: 0.1455, Train Steps/Sec: 11.76, Grad Norm: 0.0060 +[2025-02-21 21:53:25] (step=0997400) Train Loss: 0.1459, Train Steps/Sec: 11.67, Grad Norm: 0.0060 +[2025-02-21 21:53:33] (step=0997500) Train Loss: 0.1453, Train Steps/Sec: 12.15, Grad Norm: 0.0062 +[2025-02-21 21:53:41] (step=0997600) Train Loss: 0.1459, Train Steps/Sec: 11.74, Grad Norm: 0.0062 +[2025-02-21 21:53:50] (step=0997700) Train Loss: 0.1457, Train Steps/Sec: 11.69, Grad Norm: 0.0066 +[2025-02-21 21:53:58] (step=0997800) Train Loss: 0.1456, Train Steps/Sec: 12.63, Grad Norm: 0.0058 +[2025-02-21 21:54:06] (step=0997900) Train Loss: 0.1445, Train Steps/Sec: 12.62, Grad Norm: 0.0060 +[2025-02-21 21:54:14] (step=0998000) Train Loss: 0.1452, Train Steps/Sec: 12.21, Grad Norm: 0.0059 +[2025-02-21 21:54:22] (step=0998100) Train Loss: 0.1462, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 21:54:30] (step=0998200) Train Loss: 0.1470, Train Steps/Sec: 11.77, Grad Norm: 0.0066 +[2025-02-21 21:54:40] (step=0998300) Train Loss: 0.1456, Train Steps/Sec: 10.01, Grad Norm: 0.0062 +[2025-02-21 21:54:48] (step=0998400) Train Loss: 0.1459, Train Steps/Sec: 12.18, Grad Norm: 0.0059 +[2025-02-21 21:54:56] (step=0998500) Train Loss: 0.1443, Train Steps/Sec: 12.59, Grad Norm: 0.0062 +[2025-02-21 21:55:04] (step=0998600) Train Loss: 0.1449, Train Steps/Sec: 12.62, Grad Norm: 0.0064 +[2025-02-21 21:55:12] (step=0998700) Train Loss: 0.1458, Train Steps/Sec: 12.63, Grad Norm: 0.0065 +[2025-02-21 21:55:21] (step=0998800) Train Loss: 0.1450, Train Steps/Sec: 11.72, Grad Norm: 0.0061 +[2025-02-21 21:55:31] (step=0998900) Train Loss: 0.1471, Train Steps/Sec: 10.32, Grad Norm: 0.0061 +[2025-02-21 21:55:39] (step=0999000) Train Loss: 0.1462, Train Steps/Sec: 11.74, Grad Norm: 0.0061 +[2025-02-21 21:55:48] (step=0999100) Train Loss: 0.1456, Train Steps/Sec: 11.00, Grad Norm: 0.0062 +[2025-02-21 21:55:56] (step=0999200) Train Loss: 0.1460, Train Steps/Sec: 12.62, Grad Norm: 0.0058 +[2025-02-21 21:56:04] (step=0999300) Train Loss: 0.1453, Train Steps/Sec: 12.63, Grad Norm: 0.0059 +[2025-02-21 21:56:12] (step=0999400) Train Loss: 0.1469, Train Steps/Sec: 12.63, Grad Norm: 0.0061 +[2025-02-21 21:56:20] (step=0999500) Train Loss: 0.1456, Train Steps/Sec: 11.79, Grad Norm: 0.0060 +[2025-02-21 21:56:30] (step=0999600) Train Loss: 0.1450, Train Steps/Sec: 10.53, Grad Norm: 0.0059 +[2025-02-21 21:56:38] (step=0999700) Train Loss: 0.1445, Train Steps/Sec: 12.18, Grad Norm: 0.0064 +[2025-02-21 21:56:47] (step=0999800) Train Loss: 0.1452, Train Steps/Sec: 11.33, Grad Norm: 0.0059 +[2025-02-21 21:56:55] (step=0999900) Train Loss: 0.1459, Train Steps/Sec: 12.15, Grad Norm: 0.0060 +[2025-02-21 21:57:03] (step=1000000) Train Loss: 0.1455, Train Steps/Sec: 12.62, Grad Norm: 0.0061 +[2025-02-21 21:57:07] Saved checkpoint to ../logs/smo/diff_b_1000kx1024_qknorm/checkpoints/1000000.pt +[2025-02-21 22:08:06] Calculating FID with 50000 number of samples +[2025-02-21 22:08:49] (step=1000000), Fid=20.646203280517113 +[2025-02-21 22:08:49] Done!